GzipPipelineCachedStorage from fnpdjango.

[wolnelektury.git] / apps / search / index.py
diff --git a/apps/search/index.py b/apps/search/index.py

index 33836ad..7fb60b5 100644 (file)
--- a/apps/search/index.py
+++ b/apps/search/index.py
@@ -1,25 +1,9 @@
  # -*- coding: utf-8 -*-
  # -*- coding: utf-8 -*-
-
+# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
  from django.conf import settings
  from django.conf import settings
-from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
-    File, Field, Integer, \
-    NumericField, Version, Document, JavaError, IndexSearcher, \
-    QueryParser, PerFieldAnalyzerWrapper, \
-    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
-    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
-    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
-    HashSet, BooleanClause, Term, CharTermAttribute, \
-    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
-    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, Integer, \
-    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
-    BooleanFilter, TermsFilter, FilterClause, QueryWrapperFilter, \
-    initVM, CLASSPATH, JArray, JavaError
-    # KeywordAnalyzer
-
-# Initialize jvm
-JVM = initVM(CLASSPATH)
-
-import sys
+
  import os
  import re
  import errno
  import os
  import re
  import errno
@@ -27,79 +11,20 @@ from librarian import dcparser
  from librarian.parser import WLDocument
  from lxml import etree
  import catalogue.models
  from librarian.parser import WLDocument
  from lxml import etree
  import catalogue.models
-from multiprocessing.pool import ThreadPool
-from threading import current_thread
-import atexit
+from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
+from itertools import chain
  import traceback
  import traceback
+import logging
+log = logging.getLogger('search')
+import sunburnt
+import custom
+import operator
  
  
+log = logging.getLogger('search')
  
  
-class WLAnalyzer(PerFieldAnalyzerWrapper):
-    def __init__(self):
-        polish = PolishAnalyzer(Version.LUCENE_34)
-        #        polish_gap.setPositionIncrementGap(999)
-
-        simple = SimpleAnalyzer(Version.LUCENE_34)
-        #        simple_gap.setPositionIncrementGap(999)
-
-        keyword = KeywordAnalyzer(Version.LUCENE_34)
-
-        # not sure if needed: there's NOT_ANALYZED meaning basically the same
-
-        PerFieldAnalyzerWrapper.__init__(self, polish)
-
-        self.addAnalyzer("tags", simple)
-        self.addAnalyzer("technical_editors", simple)
-        self.addAnalyzer("editors", simple)
-        self.addAnalyzer("url", keyword)
-        self.addAnalyzer("source_url", keyword)
-        self.addAnalyzer("source_name", simple)
-        self.addAnalyzer("publisher", simple)
-        self.addAnalyzer("authors", simple)
-        self.addAnalyzer("title", simple)
-
-        self.addAnalyzer("is_book", keyword)
-        # shouldn't the title have two forms? _pl and simple?
-
-        self.addAnalyzer("themes", simple)
-        self.addAnalyzer("themes_pl", polish)
-
-        self.addAnalyzer("tag_name", simple)
-        self.addAnalyzer("tag_name_pl", polish)
-
-        self.addAnalyzer("translators", simple)
-
-        self.addAnalyzer("KEYWORD", keyword)
-        self.addAnalyzer("SIMPLE", simple)
-        self.addAnalyzer("POLISH", polish)
-
-
-class IndexStore(object):
-    """
-    Provides access to search index.
-
-    self.store - lucene index directory
-    """
-    def __init__(self):
-        self.make_index_dir()
-        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
-
-    def make_index_dir(self):
-        try:
-            os.makedirs(settings.SEARCH_INDEX)
-        except OSError as exc:
-            if exc.errno == errno.EEXIST:
-                pass
-            else: raise
-
-
-class IndexChecker(IndexStore):
-    def __init__(self):
-        IndexStore.__init__(self)
-
-    def check(self):
-        checker = CheckIndex(self.store)
-        status = checker.checkIndex()
-        return status
+class SolrIndex(object):
+    def __init__(self, mode=None):
+        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  
  
  class Snippets(object):
  
  
  class Snippets(object):
@@ -110,7 +35,7 @@ class Snippets(object):
      """
      SNIPPET_DIR = "snippets"
  
      """
      SNIPPET_DIR = "snippets"
  
-    def __init__(self, book_id):
+    def __init__(self, book_id, revision=None):
          try:
              os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
          except OSError as exc:
          try:
              os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
          except OSError as exc:
@@ -118,15 +43,32 @@ class Snippets(object):
                  pass
              else: raise
          self.book_id = book_id
                  pass
              else: raise
          self.book_id = book_id
+        self.revision = revision
          self.file = None
  
          self.file = None
  
+    @property
+    def path(self):
+        if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
+        else: fn = "%d" % self.book_id
+
+        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
+
      def open(self, mode='r'):
          """
          Open the snippet file. Call .close() afterwards.
          """
          if not 'b' in mode:
              mode += 'b'
      def open(self, mode='r'):
          """
          Open the snippet file. Call .close() afterwards.
          """
          if not 'b' in mode:
              mode += 'b'
-        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
+
+        if 'w' in mode:
+            if os.path.exists(self.path):
+                self.revision = 1
+                while True:
+                    if not os.path.exists(self.path):
+                        break
+                    self.revision += 1
+
+        self.file = open(self.path, mode)
          self.position = 0
          return self
  
          self.position = 0
          return self
  
@@ -153,87 +95,142 @@ class Snippets(object):
  
      def close(self):
          """Close snippet file"""
  
      def close(self):
          """Close snippet file"""
-        self.file.close()
-
-
-class BaseIndex(IndexStore):
-    """
-    Base index class.
-    Provides basic operations on index: opening, closing, optimizing.
-    """
-    def __init__(self, analyzer=None):
-        super(BaseIndex, self).__init__()
-        self.index = None
-        if not analyzer:
-            analyzer = WLAnalyzer()
-        self.analyzer = analyzer
-
-    def open(self, analyzer=None):
-        if self.index:
-            raise Exception("Index is already opened")
-        self.index = IndexWriter(self.store, self.analyzer,\
-                                 IndexWriter.MaxFieldLength.LIMITED)
-        return self.index
-
-    def optimize(self):
-        self.index.optimize()
+        if self.file:
+            self.file.close()
  
  
-    def close(self):
+    def remove(self):
+        self.revision = None
          try:
          try:
-            self.index.optimize()
-        except JavaError, je:
-            print "Error during optimize phase, check index: %s" % je
-
-        self.index.close()
-        self.index = None
-
-    def __enter__(self):
-        self.open()
-        return self
-
-    def __exit__(self, type, value, tb):
-        self.close()
+            os.unlink(self.path)
+            self.revision = 0
+            while True:
+                self.revision += 1
+                os.unlink(self.path)
+        except OSError:
+            pass
  
  
  
  
-class Index(BaseIndex):
+class Index(SolrIndex):
      """
      Class indexing books.
      """
      """
      Class indexing books.
      """
-    def __init__(self, analyzer=None):
-        super(Index, self).__init__(analyzer)
+    def __init__(self):
+        super(Index, self).__init__(mode='rw')
+
+    def delete_query(self, *queries):
+        """
+        index.delete(queries=...) doesn't work, so let's reimplement it
+        using deletion of list of uids.
+        """
+        uids = set()
+        for q in queries:
+            if isinstance(q, sunburnt.search.LuceneQuery):
+                q = self.index.query(q)
+            q.field_limiter.update(['uid'])
+            st = 0
+            rows = 100
+            while True:
+                ids = q.paginate(start=st, rows=rows).execute()
+                if not len(ids):
+                    break
+                for res in ids:
+                    uids.add(res['uid'])
+                st += rows
+        if uids:
+            self.index.delete(uids)
+            return True
+        else:
+            return False
  
  
-    def index_tags(self):
+    def index_tags(self, *tags, **kw):
          """
          Re-index global tag list.
          Removes all tags from index, then index them again.
          Indexed fields include: id, name (with and without polish stems), category
          """
          """
          Re-index global tag list.
          Removes all tags from index, then index them again.
          Indexed fields include: id, name (with and without polish stems), category
          """
-        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
-        self.index.deleteDocuments(q)
-
-        for tag in catalogue.models.Tag.objects.all():
-            doc = Document()
-            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
-            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
-            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
-            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
-            self.index.addDocument(doc)
+        log.debug("Indexing tags")
+        remove_only = kw.get('remove_only', False)
+        # first, remove tags from index.
+        if tags:
+            tag_qs = []
+            for tag in tags:
+                q_id = self.index.Q(tag_id=tag.id)
+
+                if isinstance(tag, PDCounterAuthor):
+                    q_cat = self.index.Q(tag_category='pd_author')
+                elif isinstance(tag, PDCounterBook):
+                    q_cat = self.index.Q(tag_category='pd_book')
+                else:
+                    q_cat = self.index.Q(tag_category=tag.category)
+
+                q_id_cat = self.index.Q(q_id & q_cat)
+                tag_qs.append(q_id_cat)
+            self.delete_query(*tag_qs)
+        else:  # all
+            q = self.index.Q(tag_id__any=True)
+            self.delete_query(q)
+
+        if not remove_only:
+            # then add them [all or just one passed]
+            if not tags:
+                tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
+                    PDCounterAuthor.objects.all(), \
+                    PDCounterBook.objects.all())
+
+            for tag in tags:
+                if isinstance(tag, PDCounterAuthor):
+                    doc = {
+                        "tag_id": int(tag.id),
+                        "tag_name": tag.name,
+                        "tag_name_pl": tag.name,
+                        "tag_category": 'pd_author',
+                        "is_pdcounter": True,
+                        "uid": "tag%d_pd_a" % tag.id
+                        }
+                elif isinstance(tag, PDCounterBook):
+                    doc = {
+                        "tag_id": int(tag.id),
+                        "tag_name": tag.title,
+                        "tag_name_pl": tag.title,
+                        "tag_category": 'pd_book',
+                        "is_pdcounter": True,
+                        "uid": "tag%d_pd_b" % tag.id
+                        }
+                else:
+                    doc = {
+                        "tag_id": int(tag.id),
+                        "tag_name": tag.name,
+                        "tag_name_pl": tag.name,
+                        "tag_category": tag.category,
+                        "is_pdcounter": False,
+                        "uid": "tag%d" % tag.id
+                        }
+                self.index.add(doc)
  
      def create_book_doc(self, book):
          """
          Create a lucene document referring book id.
          """
  
      def create_book_doc(self, book):
          """
          Create a lucene document referring book id.
          """
-        doc = Document()
-        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
+        doc = {
+            'book_id': int(book.id),
+            }
          if book.parent is not None:
          if book.parent is not None:
-            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
+            doc["parent_id"] = int(book.parent.id)
          return doc
  
          return doc
  
-    def remove_book(self, book):
+    def remove_book(self, book_or_id, remove_snippets=True):
          """Removes a book from search index.
          book - Book instance."""
          """Removes a book from search index.
          book - Book instance."""
-        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
-        self.index.deleteDocuments(q)
+        if isinstance(book_or_id, catalogue.models.Book):
+            book_id = book_or_id.id
+        else:
+            book_id = book_or_id
+
+        self.delete_query(self.index.Q(book_id=book_id))
+
+        if remove_snippets:
+            snippets = Snippets(book_id)
+            snippets.remove()
  
      def index_book(self, book, book_info=None, overwrite=True):
          """
  
      def index_book(self, book, book_info=None, overwrite=True):
          """
@@ -242,21 +239,32 @@ class Index(BaseIndex):
          and calls self.index_content() to index the contents of the book.
          """
          if overwrite:
          and calls self.index_content() to index the contents of the book.
          """
          if overwrite:
-            self.remove_book(book)
+            # we don't remove snippets, since they might be still needed by
+            # threads using not reopened index
+            self.remove_book(book, remove_snippets=False)
  
          book_doc = self.create_book_doc(book)
  
          book_doc = self.create_book_doc(book)
-        meta_fields = self.extract_metadata(book, book_info)
-        for f in meta_fields.values():
-            if isinstance(f, list) or isinstance(f, tuple):
-                for elem in f:
-                    book_doc.add(elem)
-            else:
-                book_doc.add(f)
+        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
+        # let's not index it - it's only used for extracting publish date
+        if 'source_name' in meta_fields:
+            del meta_fields['source_name']
  
  
-        self.index.addDocument(book_doc)
+        for n, f in meta_fields.items():
+            book_doc[n] = f
+
+        book_doc['uid'] = "book%s" % book_doc['book_id']
+        self.index.add(book_doc)
          del book_doc
          del book_doc
+        book_fields = {
+            'title': meta_fields['title'],
+            'authors': meta_fields['authors'],
+            'published_date': meta_fields['published_date']
+            }
+
+        if 'translators' in meta_fields:
+            book_fields['translators'] = meta_fields['translators']
  
  
-        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
+        self.index_content(book, book_fields=book_fields)
  
      master_tags = [
          'opowiadanie',
  
      master_tags = [
          'opowiadanie',
@@ -276,11 +284,11 @@ class Index(BaseIndex):
  
      footnote_tags = ['pa', 'pt', 'pr', 'pe']
  
  
      footnote_tags = ['pa', 'pt', 'pr', 'pe']
  
-    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
+    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
  
      published_date_re = re.compile("([0-9]+)[\]. ]*$")
  
  
      published_date_re = re.compile("([0-9]+)[\]. ]*$")
  
-    def extract_metadata(self, book, book_info=None):
+    def extract_metadata(self, book, book_info=None, dc_only=None):
          """
          Extract metadata from book and returns a map of fields keyed by fieldname
          """
          """
          Extract metadata from book and returns a map of fields keyed by fieldname
          """
@@ -289,12 +297,14 @@ class Index(BaseIndex):
          if book_info is None:
              book_info = dcparser.parse(open(book.xml_file.path))
  
          if book_info is None:
              book_info = dcparser.parse(open(book.xml_file.path))
  
-        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
-        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
-        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
+        fields['slug'] = book.slug
+        fields['tags'] = [t.name  for t in book.tags]
+        fields['is_book'] = True
  
          # validator, name
          for field in dcparser.BookInfo.FIELDS:
  
          # validator, name
          for field in dcparser.BookInfo.FIELDS:
+            if dc_only and field.name not in dc_only:
+                continue
              if hasattr(book_info, field.name):
                  if not getattr(book_info, field.name):
                      continue
              if hasattr(book_info, field.name):
                  if not getattr(book_info, field.name):
                      continue
@@ -304,40 +314,38 @@ class Index(BaseIndex):
                      s = getattr(book_info, field.name)
                      if field.multiple:
                          s = ', '.join(s)
                      s = getattr(book_info, field.name)
                      if field.multiple:
                          s = ', '.join(s)
-                    try:
-                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
-                    except JavaError as je:
-                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
+                    fields[field.name] = s
                  elif type_indicator == dcparser.as_person:
                      p = getattr(book_info, field.name)
                      if isinstance(p, dcparser.Person):
                          persons = unicode(p)
                      else:
                          persons = ', '.join(map(unicode, p))
                  elif type_indicator == dcparser.as_person:
                      p = getattr(book_info, field.name)
                      if isinstance(p, dcparser.Person):
                          persons = unicode(p)
                      else:
                          persons = ', '.join(map(unicode, p))
-                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
+                    fields[field.name] = persons
                  elif type_indicator == dcparser.as_date:
                      dt = getattr(book_info, field.name)
                  elif type_indicator == dcparser.as_date:
                      dt = getattr(book_info, field.name)
-                    fields[field.name] = Field(field.name, "%04d%02d%02d" %\
-                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
+                    fields[field.name] = dt
  
          # get published date
  
          # get published date
-        source = book_info.source_name
-        match = self.published_date_re.search(source)
-        print("published date is %s %s" % (match, match is not None and match.groups()))
-        if match is not None:
-            fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
+        pd = None
+        if hasattr(book_info, 'source_name') and book_info.source_name:
+            match = self.published_date_re.search(book_info.source_name)
+            if match is not None:
+                pd = str(match.groups()[0])
+        if not pd: pd = ""
+        fields["published_date"] = pd
  
          return fields
  
  
          return fields
  
-    def add_gaps(self, fields, fieldname):
-        """
-        Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
-        This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
-        """
-        def gap():
-            while True:
-                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
-        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
+    # def add_gaps(self, fields, fieldname):
+    #     """
+    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
+    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
+    #     """
+    #     def gap():
+    #         while True:
+    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
+    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
  
      def get_master(self, root):
          """
  
      def get_master(self, root):
          """
@@ -347,7 +355,7 @@ class Index(BaseIndex):
              if master.tag in self.master_tags:
                  return master
  
              if master.tag in self.master_tags:
                  return master
  
-    def index_content(self, book, book_fields=[]):
+    def index_content(self, book, book_fields={}):
          """
          Walks the book XML and extract content from it.
          Adds parts for each header tag and for each fragment.
          """
          Walks the book XML and extract content from it.
          Adds parts for each header tag and for each fragment.
@@ -360,11 +368,18 @@ class Index(BaseIndex):
              return []
  
          def walker(node, ignore_tags=[]):
              return []
  
          def walker(node, ignore_tags=[]):
-            yield node, None
-            for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
-                for b, e in walker(child):
-                    yield b, e
-            yield None, node
+
+            if node.tag not in ignore_tags:
+                yield node, None, None
+                if node.text is not None:
+                    yield None, node.text, None
+                for child in list(node):
+                    for b, t, e in walker(child):
+                        yield b, t, e
+                yield None, None, node
+
+            if node.tail is not None:
+                yield None, node.tail, None
              return
  
          def fix_format(text):
              return
  
          def fix_format(text):
@@ -383,39 +398,31 @@ class Index(BaseIndex):
  
          def add_part(snippets, **fields):
              doc = self.create_book_doc(book)
  
          def add_part(snippets, **fields):
              doc = self.create_book_doc(book)
-            for f in book_fields:
-                doc.add(f)
+            for n, v in book_fields.items():
+                doc[n] = v
  
  
-            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
-            doc.add(NumericField("header_span", Field.Store.YES, True)\
-                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
-            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
+            doc['header_index'] = fields["header_index"]
+            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
+            doc['header_type'] = fields['header_type']
  
  
-            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
-                          Field.TermVector.WITH_POSITIONS_OFFSETS))
+            doc['text'] = fields['text']
  
  
-            snip_pos = snippets.add(fields["content"])
-            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
-            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
+            # snippets
+            snip_pos = snippets.add(fields["text"])
+
+            doc['snippets_position'] = snip_pos[0]
+            doc['snippets_length'] = snip_pos[1]
+            if snippets.revision:
+                doc["snippets_revision"] = snippets.revision
  
              if 'fragment_anchor' in fields:
  
              if 'fragment_anchor' in fields:
-                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
-                              Field.Store.YES, Field.Index.NOT_ANALYZED))
+                doc["fragment_anchor"] = fields['fragment_anchor']
  
              if 'themes' in fields:
  
              if 'themes' in fields:
-                themes, themes_pl = zip(*[
-                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
-                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
-                     for theme in fields['themes']])
-
-                themes = self.add_gaps(themes, 'themes')
-                themes_pl = self.add_gaps(themes_pl, 'themes_pl')
-
-                for t in themes:
-                    doc.add(t)
-                for t in themes_pl:
-                    doc.add(t)
-
+                doc['themes'] = fields['themes']
+            doc['uid'] = "part%s%s%s" % (doc['header_index'],
+                                         doc['header_span'],
+                                         doc.get('fragment_anchor', ''))
              return doc
  
          def give_me_utf8(s):
              return doc
  
          def give_me_utf8(s):
@@ -426,9 +433,8 @@ class Index(BaseIndex):
  
          fragments = {}
          snippets = Snippets(book.id).open('w')
  
          fragments = {}
          snippets = Snippets(book.id).open('w')
-        position = 0
          try:
          try:
-            for header in list(master):
+            for header, position in zip(list(master), range(len(master))):
  
                  if header.tag in self.skip_header_tags:
                      continue
  
                  if header.tag in self.skip_header_tags:
                      continue
@@ -437,35 +443,49 @@ class Index(BaseIndex):
  
                  # section content
                  content = []
  
                  # section content
                  content = []
-                footnote = None
+                footnote = []
  
  
-                for start, end in walker(header, ignore_tags=self.ignore_content_tags):
+                def all_content(text):
+                    for frag in fragments.values():
+                        frag['text'].append(text)
+                    content.append(text)
+                handle_text = [all_content]
+
+                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                      # handle footnotes
                      if start is not None and start.tag in self.footnote_tags:
                      # handle footnotes
                      if start is not None and start.tag in self.footnote_tags:
-                        footnote = ' '.join(start.itertext())
-                    elif end is not None and footnote is not None and end.tag in self.footnote_tags:
-                        doc = add_part(snippets, header_index=position, header_type=header.tag,
-                                       content=footnote)
+                        footnote = []
  
  
-                        self.index.addDocument(doc)
+                        def collect_footnote(t):
+                            footnote.append(t)
  
  
-                        footnote = None
+                        handle_text.append(collect_footnote)
+                    elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
+                        handle_text.pop()
+                        doc = add_part(snippets, header_index=position, header_type=header.tag,
+                                       text=u''.join(footnote),
+                                       is_footnote=True)
+                        self.index.add(doc)
+                        footnote = []
  
                      # handle fragments and themes.
                      if start is not None and start.tag == 'begin':
                          fid = start.attrib['id'][1:]
  
                      # handle fragments and themes.
                      if start is not None and start.tag == 'begin':
                          fid = start.attrib['id'][1:]
-                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
+                        fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
  
  
+                    # themes for this fragment
                      elif start is not None and start.tag == 'motyw':
                          fid = start.attrib['id'][1:]
                      elif start is not None and start.tag == 'motyw':
                          fid = start.attrib['id'][1:]
+                        handle_text.append(None)
                          if start.text is not None:
                          if start.text is not None:
-                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
+                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
+                    elif end is not None and end.tag == 'motyw':
+                        handle_text.pop()
  
                      elif start is not None and start.tag == 'end':
                          fid = start.attrib['id'][1:]
                          if fid not in fragments:
                              continue  # a broken <end> node, skip it
  
                      elif start is not None and start.tag == 'end':
                          fid = start.attrib['id'][1:]
                          if fid not in fragments:
                              continue  # a broken <end> node, skip it
-                                      #                        import pdb; pdb.set_trace()
                          frag = fragments[fid]
                          if frag['themes'] == []:
                              continue  # empty themes list.
                          frag = fragments[fid]
                          if frag['themes'] == []:
                              continue  # empty themes list.
@@ -476,162 +496,75 @@ class Index(BaseIndex):
                                         header_index=frag['start_section'],
                                         header_span=position - frag['start_section'] + 1,
                                         fragment_anchor=fid,
                                         header_index=frag['start_section'],
                                         header_span=position - frag['start_section'] + 1,
                                         fragment_anchor=fid,
-                                       content=fix_format(frag['content']),
+                                       text=fix_format(frag['text']),
                                         themes=frag['themes'])
                                         themes=frag['themes'])
-
-                        self.index.addDocument(doc)
+                        self.index.add(doc)
  
                          # Collect content.
  
                          # Collect content.
-                    elif start is not None:
-                        for frag in fragments.values():
-                            frag['content'].append(start.text)
-                        content.append(start.text)
-                    elif end is not None:
-                        for frag in fragments.values():
-                            frag['content'].append(end.tail)
-                        content.append(end.tail)
+
+                    if text is not None and handle_text is not []:
+                        hdl = handle_text[-1]
+                        if hdl is not None:
+                            hdl(text)
  
                          # in the end, add a section text.
  
                          # in the end, add a section text.
-                doc = add_part(snippets, header_index=position, header_type=header.tag,
-                               content=fix_format(content))
+                doc = add_part(snippets, header_index=position,
+                               header_type=header.tag, text=fix_format(content))
  
  
-                self.index.addDocument(doc)
-                position += 1
+                self.index.add(doc)
  
          finally:
              snippets.close()
  
  
  
          finally:
              snippets.close()
  
  
-def log_exception_wrapper(f):
-    def _wrap(*a):
-        try:
-            f(*a)
-        except Exception, e:
-            print("Error in indexing thread: %s" % e)
-            traceback.print_exc()
-            raise e
-    return _wrap
-
-
-class ReusableIndex(Index):
-    """
-    Works like index, but does not close/optimize Lucene index
-    until program exit (uses atexit hook).
-    This is usefull for importbooks command.
-
-    if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
-    """
-    index = None
-
-    def open(self, analyzer=None, threads=4):
-        if ReusableIndex.index is not None:
-            self.index = ReusableIndex.index
-        else:
-            print("opening index")
-            Index.open(self, analyzer)
-            ReusableIndex.index = self.index
-            atexit.register(ReusableIndex.close_reusable)
-
-    # def index_book(self, *args, **kw):
-    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
-    #     ReusableIndex.pool_jobs.append(job)
-
-    @staticmethod
-    def close_reusable():
-        if ReusableIndex.index is not None:
-            ReusableIndex.index.optimize()
-            ReusableIndex.index.close()
-            ReusableIndex.index = None
-
-    def close(self):
-        pass
-
-
-class JoinSearch(object):
-    """
-    This mixin could be used to handle block join queries.
-    (currently unused)
-    """
-    def __init__(self, *args, **kw):
-        super(JoinSearch, self).__init__(*args, **kw)
-
-    def wrapjoins(self, query, fields=[]):
-        """
-        This functions modifies the query in a recursive way,
-        so Term and Phrase Queries contained, which match
-        provided fields are wrapped in a BlockJoinQuery,
-        and so delegated to children documents.
-        """
-        if BooleanQuery.instance_(query):
-            qs = BooleanQuery.cast_(query)
-            for clause in qs:
-                clause = BooleanClause.cast_(clause)
-                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
-            return qs
-        else:
-            termset = HashSet()
-            query.extractTerms(termset)
-            for t in termset:
-                t = Term.cast_(t)
-                if t.field() not in fields:
-                    return query
-            return BlockJoinQuery(query, self.parent_filter,
-                                  BlockJoinQuery.ScoreMode.Total)
-
-    def bsearch(self, query, max_results=50):
-        q = self.query(query)
-        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
-
-        tops = self.searcher.search(bjq, max_results)
-        bks = []
-        for found in tops.scoreDocs:
-            doc = self.searcher.doc(found.doc)
-            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
-        return (bks, tops.totalHits)
-
-
  class SearchResult(object):
  class SearchResult(object):
-    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
-        if tokens_cache is None: tokens_cache = {}
-
-        if score:
-            self._score = score
-        else:
-            self._score = scoreDocs.score
-
+    def __init__(self, doc, how_found=None, query=None, query_terms=None):
+        #        self.search = search
          self.boost = 1.0
          self.boost = 1.0
-
          self._hits = []
          self._processed_hits = None  # processed hits
          self._hits = []
          self._processed_hits = None  # processed hits
+        self.snippets = []
+        self.query_terms = query_terms
  
  
-        stored = search.searcher.doc(scoreDocs.doc)
-        self.book_id = int(stored.get("book_id"))
-
-        header_type = stored.get("header_type")
-        if not header_type:
-            return
-
-        sec = (header_type, int(stored.get("header_index")))
-        header_span = stored.get('header_span')
-        header_span = header_span is not None and int(header_span) or 1
-
-        fragment = stored.get("fragment_anchor")
+        if 'score' in doc:
+            self._score = doc['score']
+        else:
+            self._score = 0
  
  
-        pd = stored.get("published_date")
-        if pd is None:
-            print "published_date is none for book %d" % self.book_id
-            pd = 0
-        self.published_date = int(pd)
+        self.book_id = int(doc["book_id"])
  
  
-        if snippets:
-            snippets = snippets.replace("/\n", "\n")
-        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
+        try:
+            self.published_date = int(doc.get("published_date"))
+        except ValueError:
+            self.published_date = 0
+
+        # content hits
+        header_type = doc.get("header_type", None)
+        # we have a content hit in some header of fragment
+        if header_type is not None:
+            sec = (header_type, int(doc["header_index"]))
+            header_span = doc['header_span']
+            header_span = header_span is not None and int(header_span) or 1
+            fragment = doc.get("fragment_anchor", None)
+            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
+            snippets_rev = doc.get('snippets_revision', None)
+
+            hit = (sec + (header_span,), fragment, self._score, {
+                'how_found': how_found,
+                'snippets_pos': snippets_pos,
+                'snippets_revision': snippets_rev,
+                'themes': doc.get('themes', []),
+                'themes_pl': doc.get('themes_pl', [])
+                })
+
+            self._hits.append(hit)
  
  
-        self._hits.append(hit)
+    def __unicode__(self):
+        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
+            (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
  
  
-        self.search = search
-        self.searched = searched
-        self.tokens_cache = tokens_cache
+    def __str__(self):
+        return unicode(self).encode('utf-8')
  
      @property
      def score(self):
  
      @property
      def score(self):
@@ -642,92 +575,108 @@ class SearchResult(object):
              raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
          self._hits += other._hits
          if other.score > self.score:
              raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
          self._hits += other._hits
          if other.score > self.score:
-            self.score = other.score
+            self._score = other._score
          return self
  
      def get_book(self):
          return self
  
      def get_book(self):
-        return catalogue.models.Book.objects.get(id=self.book_id)
+        if hasattr(self, '_book'):
+            return self._book
+        self._book = catalogue.models.Book.objects.get(id=self.book_id)
+        return self._book
  
      book = property(get_book)
  
  
      book = property(get_book)
  
+    POSITION = 0
+    FRAGMENT = 1
+    POSITION_INDEX = 1
+    POSITION_SPAN = 2
+    SCORE = 2
+    OTHER = 3
+
      @property
      def hits(self):
          if self._processed_hits is not None:
              return self._processed_hits
  
      @property
      def hits(self):
          if self._processed_hits is not None:
              return self._processed_hits
  
-        POSITION = 0
-        FRAGMENT = 1
-        POSITION_INDEX = 1
-        POSITION_SPAN = 2
-        SCORE = 2
-        OTHER = 3
-
          # to sections and fragments
          # to sections and fragments
-        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
-        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
+        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
+
+        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
+
+        # sections not covered by fragments
          sect = filter(lambda s: 0 == len(filter(
          sect = filter(lambda s: 0 == len(filter(
-            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
-            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
+            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
+            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
              frags)), sect)
  
          hits = []
  
              frags)), sect)
  
          hits = []
  
-        # remove duplicate fragments
-        fragments = {}
-        for f in frags:
-            fid = f[FRAGMENT]
-            if fid in fragments:
-                if fragments[fid][SCORE] >= f[SCORE]:
-                    continue
-            fragments[fid] = f
-        frags = fragments.values()
+        def remove_duplicates(lst, keyfn, compare):
+            els = {}
+            for e in lst:
+                eif = keyfn(e)
+                if eif in els:
+                    if compare(els[eif], e) >= 1:
+                        continue
+                els[eif] = e
+            return els.values()
+
+        # remove fragments with duplicated fid's and duplicated snippets
+        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
+        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
+        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
  
          # remove duplicate sections
          sections = {}
  
          for s in sect:
  
          # remove duplicate sections
          sections = {}
  
          for s in sect:
-            si = s[POSITION][POSITION_INDEX]
+            si = s[self.POSITION][self.POSITION_INDEX]
              # skip existing
              if si in sections:
              # skip existing
              if si in sections:
-                if sections[si]['score'] >= s[SCORE]:
+                if sections[si]['score'] >= s[self.SCORE]:
                      continue
  
                      continue
  
-            m = {'score': s[SCORE],
-                 'section_number': s[POSITION][POSITION_INDEX] + 1,
+            m = {'score': s[self.SCORE],
+                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                   }
                   }
-            m.update(s[OTHER])
+            m.update(s[self.OTHER])
              sections[si] = m
  
          hits = sections.values()
  
          for f in frags:
              try:
              sections[si] = m
  
          hits = sections.values()
  
          for f in frags:
              try:
-                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
+                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
              except catalogue.models.Fragment.DoesNotExist:
                  # stale index
                  continue
              except catalogue.models.Fragment.DoesNotExist:
                  # stale index
                  continue
-
              # Figure out if we were searching for a token matching some word in theme name.
              themes = frag.tags.filter(category='theme')
              # Figure out if we were searching for a token matching some word in theme name.
              themes = frag.tags.filter(category='theme')
-            themes_hit = []
-            if self.searched is not None:
-                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
-                for theme in themes:
-                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
-                    print "THEME HIT: %s in %s" % (tokens, name_tokens)
-                    for t in tokens:
-                        if t in name_tokens:
-                            if not theme in themes_hit:
-                                themes_hit.append(theme)
+            themes_hit = set()
+            if self.query_terms is not None:
+                for i in range(0, len(f[self.OTHER]['themes'])):
+                    tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
+                    tms = map(unicode.lower, tms)
+                    for qt in self.query_terms:
+                        if qt in tms:
+                            themes_hit.add(f[self.OTHER]['themes'][i])
                              break
  
                              break
  
-            m = {'score': f[SCORE],
+            def theme_by_name(n):
+                th = filter(lambda t: t.name == n, themes)
+                if th:
+                    return th[0]
+                else:
+                    return None
+            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
+
+            m = {'score': f[self.SCORE],
                   'fragment': frag,
                   'fragment': frag,
-                 'section_number': f[POSITION][POSITION_INDEX] + 1,
+                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                   'themes': themes,
                   'themes_hit': themes_hit
                   }
                   'themes': themes,
                   'themes_hit': themes_hit
                   }
-            m.update(f[OTHER])
+            m.update(f[self.OTHER])
              hits.append(m)
  
          hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
              hits.append(m)
  
          hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
@@ -736,9 +685,6 @@ class SearchResult(object):
  
          return hits
  
  
          return hits
  
-    def __unicode__(self):
-        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
-
      @staticmethod
      def aggregate(*result_lists):
          books = {}
      @staticmethod
      def aggregate(*result_lists):
          books = {}
@@ -746,7 +692,6 @@ class SearchResult(object):
              for r in rl:
                  if r.book_id in books:
                      books[r.book_id].merge(r)
              for r in rl:
                  if r.book_id in books:
                      books[r.book_id].merge(r)
-                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                  else:
                      books[r.book_id] = r
          return books.values()
                  else:
                      books[r.book_id] = r
          return books.values()
@@ -759,559 +704,239 @@ class SearchResult(object):
          else:
              return c
  
          else:
              return c
  
+    def __len__(self):
+        return len(self.hits)
  
  
-class Hint(object):
-    """
-    Given some hint information (information we already know about)
-    our search target - like author, title (specific book), epoch, genre, kind
-    we can narrow down search using filters.
-    """
-    def __init__(self, search):
-        """
-        Accepts a Searcher instance.
-        """
-        self.search = search
-        self.book_tags = {}
-        self.part_tags = []
-        self._books = []
-
-    def books(self, *books):
-        """
-        Give a hint that we search these books.
-        """
-        self._books = books
-
-    def tags(self, tags):
-        """
-        Give a hint that these Tag objects (a list of)
-        is necessary.
-        """
-        for t in tags:
-            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
-                lst = self.book_tags.get(t.category, [])
-                lst.append(t)
-                self.book_tags[t.category] = lst
-            if t.category in ['theme', 'theme_pl']:
-                self.part_tags.append(t)
-
-    def tag_filter(self, tags, field='tags'):
-        """
-        Given a lsit of tags and an optional field (but they are normally in tags field)
-        returns a filter accepting only books with specific tags.
-        """
-        q = BooleanQuery()
-
-        for tag in tags:
-            toks = self.search.get_tokens(tag.name, field=field)
-            tag_phrase = PhraseQuery()
-            for tok in toks:
-                tag_phrase.add(Term(field, tok))
-            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
+    def snippet_pos(self, idx=0):
+        return self.hits[idx]['snippets_pos']
  
  
-        return QueryWrapperFilter(q)
-
-    def book_filter(self):
-        """
-        Filters using book tags (all tag kinds except a theme)
-        """
-        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
-        if tags:
-            return self.tag_filter(tags)
-        else:
+    def snippet_revision(self, idx=0):
+        try:
+            return self.hits[idx]['snippets_revision']
+        except:
              return None
  
              return None
  
-    def part_filter(self):
-        """
-        This filter can be used to look for book parts.
-        It filters on book id and/or themes.
-        """
-        fs = []
-        if self.part_tags:
-            fs.append(self.tag_filter(self.part_tags, field='themes'))
-
-        if self._books != []:
-            bf = BooleanFilter()
-            for b in self._books:
-                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
-                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
-            fs.append(bf)
-
-        return Search.chain_filters(fs)
-
-    def should_search_for_book(self):
-        return self._books == []
-
-    def just_search_in(self, all):
-        """Holds logic to figure out which indexes should be search, when we have some hinst already"""
-        some = []
-        for field in all:
-            if field == 'authors' and 'author' in self.book_tags:
-                continue
-            if field == 'title' and self._books != []:
-                continue
-            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
-                continue
-            some.append(field)
-        return some
-
  
  
-class Search(IndexStore):
+class Search(SolrIndex):
      """
      Search facilities.
      """
      """
      Search facilities.
      """
-    def __init__(self, default_field="content"):
-        IndexStore.__init__(self)
-        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
-        # self.analyzer = WLAnalyzer()
-        self.searcher = IndexSearcher(self.store, True)
-        self.parser = QueryParser(Version.LUCENE_34, default_field,
-                                  self.analyzer)
-
-        self.parent_filter = TermsFilter()
-        self.parent_filter.addTerm(Term("is_book", "true"))
-
-    def query(self, query):
-        """Parse query in default Lucene Syntax. (for humans)
-        """
-        return self.parser.parse(query)
-
-    def simple_search(self, query, max_results=50):
-        """Runs a query for books using lucene syntax. (for humans)
-        Returns (books, total_hits)
-        """
-
-        tops = self.searcher.search(self.query(query), max_results)
-        bks = []
-        for found in tops.scoreDocs:
-            doc = self.searcher.doc(found.doc)
-            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
-        return (bks, tops.totalHits)
-
-    def get_tokens(self, searched, field='content', cached=None):
-        """returns tokens analyzed by a proper (for a field) analyzer
-        argument can be: StringReader, string/unicode, or tokens. In the last case
-        they will just be returned (so we can reuse tokens, if we don't change the analyzer)
-        """
-        if cached is not None and field in cached:
-            return cached[field]
+    def __init__(self, default_field="text"):
+        super(Search, self).__init__(mode='r')
  
  
-        if isinstance(searched, str) or isinstance(searched, unicode):
-            searched = StringReader(searched)
-        elif isinstance(searched, list):
-            return searched
  
  
-        searched.reset()
-        tokens = self.analyzer.reusableTokenStream(field, searched)
-        toks = []
-        while tokens.incrementToken():
-            cta = tokens.getAttribute(CharTermAttribute.class_)
-            toks.append(cta.toString())
-
-        if cached is not None:
-            cached[field] = toks
-
-        return toks
-
-    def fuzziness(self, fuzzy):
-        """Helper method to sanitize fuzziness"""
-        if not fuzzy:
-            return None
-        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
-            return fuzzy
-        else:
-            return 0.5
-
-    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
-        """
-        Return a PhraseQuery with a series of tokens.
-        """
-        if fuzzy:
-            phrase = MultiPhraseQuery()
-            for t in tokens:
-                term = Term(field, t)
-                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
-                fuzzterms = []
-
-                while True:
-                    #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
-                    ft = fuzzterm.term()
-                    if ft:
-                        fuzzterms.append(ft)
-                    if not fuzzterm.next(): break
-                if fuzzterms:
-                    phrase.add(JArray('object')(fuzzterms, Term))
-                else:
-                    phrase.add(term)
-        else:
-            phrase = PhraseQuery()
-            phrase.setSlop(slop)
-            for t in tokens:
-                term = Term(field, t)
-                phrase.add(term)
-        return phrase
-
-    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
+    def make_term_query(self, query, field='text', modal=operator.or_):
          """
          Returns term queries joined by boolean query.
          modal - applies to boolean query
          fuzzy - should the query by fuzzy.
          """
          """
          Returns term queries joined by boolean query.
          modal - applies to boolean query
          fuzzy - should the query by fuzzy.
          """
-        q = BooleanQuery()
-        for t in tokens:
-            term = Term(field, t)
-            if fuzzy:
-                term = FuzzyQuery(term, self.fuzziness(fuzzy))
-            else:
-                term = TermQuery(term)
-            q.add(BooleanClause(term, modal))
+        if query is None: query = ''
+        q = self.index.Q()
+        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
+                        query.split(r" ")), q)
+
          return q
  
          return q
  
-    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
-                      filters=None, tokens_cache=None, boost=None, snippets=False):
+    def search_phrase(self, searched, field='text', book=False,
+                      filters=None,
+                      snippets=False):
          if filters is None: filters = []
          if filters is None: filters = []
-        if tokens_cache is None: tokens_cache = {}
-
-        tokens = self.get_tokens(searched, field, cached=tokens_cache)
-
-        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy)
-        if book:
-            filters.append(self.term_filter(Term('is_book', 'true')))
-        top = self.searcher.search(query, self.chain_filters(filters), max_results)
+        if book: filters.append(self.index.Q(is_book=True))
  
  
-        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
+        q = self.index.query(**{field: searched})
+        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
+        res = q.execute()
+        return [SearchResult(found, how_found=u'search_phrase') for found in res]
  
  
-    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
-                    filters=None, tokens_cache=None, boost=None):
+    def search_some(self, searched, fields, book=True,
+                    filters=None, snippets=True, query_terms=None):
+        assert isinstance(fields, list)
          if filters is None: filters = []
          if filters is None: filters = []
-        if tokens_cache is None: tokens_cache = {}
+        if book: filters.append(self.index.Q(is_book=True))
  
  
-        if book:
-            filters.append(self.term_filter(Term('is_book', 'true')))
-
-        query = BooleanQuery()
+        query = self.index.Q()
  
          for fld in fields:
  
          for fld in fields:
-            tokens = self.get_tokens(searched, fld, cached=tokens_cache)
-
-            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
-                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
-
-        top = self.searcher.search(query, self.chain_filters(filters), max_results)
-
-        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
-                             snippets=self.get_snippets(found, query)) for found in top.scoreDocs]
-
-    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
-        """
-        Search for perfect book matches. Just see if the query matches with some author or title,
-        taking hints into account.
-        """
-        fields_to_search = ['authors', 'title']
-        only_in = None
-        if hint:
-            if not hint.should_search_for_book():
-                return []
-            fields_to_search = hint.just_search_in(fields_to_search)
-            only_in = hint.book_filter()
-
-        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
-
-        books = []
-        for q in qrys:
-            top = self.searcher.search(q,
-                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
-                max_results)
-            for found in top.scoreDocs:
-                books.append(SearchResult(self, found, how_found="search_perfect_book"))
-        return books
-
-    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
-        fields_to_search = ['tags', 'authors', 'title']
-
-        only_in = None
-        if hint:
-            if not hint.should_search_for_book():
-                return []
-            fields_to_search = hint.just_search_in(fields_to_search)
-            only_in = hint.book_filter()
-
-        tokens = self.get_tokens(searched, field='SIMPLE')
-
-        q = BooleanQuery()
-
-        for fld in fields_to_search:
-            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
-                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
-
-        books = []
-        top = self.searcher.search(q,
-                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
-            max_results)
-        for found in top.scoreDocs:
-            books.append(SearchResult(self, found, how_found="search_book"))
-
-        return books
-
-    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
-        """
-        Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
-        some part/fragment of the book.
-        """
-        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
-
-        flt = None
-        if hint:
-            flt = hint.part_filter()
+            query = self.index.Q(query | self.make_term_query(searched, fld))
  
  
-        books = []
-        for q in qrys:
-            top = self.searcher.search(q,
-                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
-                                                           flt]),
-                                       max_results)
-            for found in top.scoreDocs:
-                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
+        query = self.index.query(query)
+        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
+        res = query.execute()
+        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
  
  
-        return books
  
  
-    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
+    def search_everywhere(self, searched, query_terms=None):
          """
          Tries to use search terms to match different fields of book (or its parts).
          E.g. one word can be an author survey, another be a part of the title, and the rest
          are some words from third chapter.
          """
          """
          Tries to use search terms to match different fields of book (or its parts).
          E.g. one word can be an author survey, another be a part of the title, and the rest
          are some words from third chapter.
          """
-        if tokens_cache is None: tokens_cache = {}
+        # query_terms is not used for matching here; it is only passed on to each SearchResult.
          books = []
          books = []
-        only_in = None
-
-        if hint:
-            only_in = hint.part_filter()
-
          # content only query : themes x content
          # content only query : themes x content
-        q = BooleanQuery()
-
-        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
-        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
+        q = self.make_term_query(searched, 'text')
+        q_themes = self.make_term_query(searched, 'themes_pl')
  
  
-        # only search in themes when we do not already filter by themes
-        if hint is None or hint.just_search_in(['themes']) != []:
-            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
-                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))
+        # NOTE(review): the two chained .query() calls presumably require a hit to
+        # match both the 'text' and the 'themes_pl' clause - confirm against the index API.
+        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
+        res = query.execute()
  
  
-        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
-                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
-
-        topDocs = self.searcher.search(q, only_in, max_results)
-        for found in topDocs.scoreDocs:
-            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
-            print "* %s theme x content: %s" % (searched, books[-1]._hits)
+        for found in res:
+            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
  
          # query themes/content x author/title/tags
  
          # query themes/content x author/title/tags
-        q = BooleanQuery()
-        in_content = BooleanQuery()
-        in_meta = BooleanQuery()
+        in_content = self.index.Q()
+        in_meta = self.index.Q()
  
  
-        for fld in ['themes_pl', 'content']:
-            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
+        for fld in ['themes_pl', 'text']:
+            in_content |= self.make_term_query(searched, field=fld)
  
          for fld in ['tags', 'authors', 'title']:
  
          for fld in ['tags', 'authors', 'title']:
-            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
+            in_meta |= self.make_term_query(searched, field=fld)
  
  
-        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
-        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
+        # combine both groups: the content alternatives (themes_pl/text) joined with
+        # the metadata alternatives (tags/authors/title) through the Q "&" operator
+        q = in_content & in_meta
+        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
  
  
-        topDocs = self.searcher.search(q, only_in, max_results)
-        for found in topDocs.scoreDocs:
-            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
-            print "* %s scatter search: %s" % (searched, books[-1]._hits)
+        for found in res:
+            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
  
          return books
  
  
          return books
  
-    # def multisearch(self, query, max_results=50):
-    #     """
-    #     Search strategy:
-    #     - (phrase) OR -> content
-    #                   -> title
-    #                   -> authors
-    #     - (keywords)  -> authors
-    #                   -> motyw
-    #                   -> tags
-    #                   -> content
-    #     """
-        # queryreader = StringReader(query)
-        # tokens = self.get_tokens(queryreader)
-
-        # top_level = BooleanQuery()
-        # Should = BooleanClause.Occur.SHOULD
-
-        # phrase_level = BooleanQuery()
-        # phrase_level.setBoost(1.3)
-
-        # p_content = self.make_phrase(tokens, joined=True)
-        # p_title = self.make_phrase(tokens, 'title')
-        # p_author = self.make_phrase(tokens, 'author')
-
-        # phrase_level.add(BooleanClause(p_content, Should))
-        # phrase_level.add(BooleanClause(p_title, Should))
-        # phrase_level.add(BooleanClause(p_author, Should))
-
-        # kw_level = BooleanQuery()
-
-        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
-        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
-        # kw_level.add(j_themes, Should)
-        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
-        # j_con = self.make_term_query(tokens, joined=True)
-        # kw_level.add(j_con, Should)
-
-        # top_level.add(BooleanClause(phrase_level, Should))
-        # top_level.add(BooleanClause(kw_level, Should))
-
-        # return None
-
-    def get_snippets(self, scoreDoc, query, field='content'):
+    def get_snippets(self, searchresult, query, field='text', num=1):
          """
          Returns a snippet for found scoreDoc.
          """
          """
          Returns a snippet for found scoreDoc.
          """
-        htmlFormatter = SimpleHTMLFormatter()
-        highlighter = Highlighter(htmlFormatter, QueryScorer(query))
-
-        stored = self.searcher.doc(scoreDoc.doc)
-
-        position = stored.get('snippets_position')
-        length = stored.get('snippets_length')
-        if position is None or length is None:
-            return None
-        # locate content.
-        snippets = Snippets(stored.get('book_id')).open()
+        # Builds a list with one slot per entry of `searchresult` (len(searchresult)):
+        # the highlighted snippet for that entry, or None when none was produced.
+        # `num` caps how many non-empty snippets are collected; None, a negative
+        # value or a value above the number of entries means "no cap".
+        # The list is also stored on searchresult.snippets.
+        # Returns [] when the snippet file cannot be read (IOError).
+        maxnum = len(searchresult)
+        if num is None or num < 0 or num > maxnum:
+            num = maxnum
+        book_id = searchresult.book_id
+        revision = searchresult.snippet_revision()
+        snippets = Snippets(book_id, revision=revision)
+        snips = [None] * maxnum
          try:
          try:
-            text = snippets.get((int(position),
-                                 int(length)))
+            snippets.open()
+            idx = 0
+            while idx < maxnum and num > 0:
+                position, length = searchresult.snippet_pos(idx)
+                if position is None or length is None:
+                    # nothing stored for this entry: move on to the next one,
+                    # otherwise the loop would spin on the same idx forever
+                    idx += 1
+                    continue
+                text = snippets.get((int(position),
+                                     int(length)))
+                snip = self.index.highlight(text=text, field=field, q=query)
+                snips[idx] = snip
+                if snip:
+                    # only non-empty snippets count against the requested number
+                    num -= 1
+                idx += 1
+
+        except IOError, e:
+            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
+            return []
          finally:
              snippets.close()
  
          finally:
              snippets.close()
  
-        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
-        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
-        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
+        # remove verse end markers ("/" right before a newline); None slots stay None
+        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
  
  
-        return snip
+        searchresult.snippets = snips
  
  
-    @staticmethod
-    def enum_to_array(enum):
+        return snips
+
+    def hint_tags(self, query, pdcounter=True, prefix=True):
          """
          """
-        Converts a lucene TermEnum to array of Terms, suitable for
-        addition to queries
+        Return auto-complete hints for tags
+        using prefix search.
+
+        The stripped text is looked up in 'tag_name' and 'tag_name_pl'.
+        With prefix=True a trailing "*" is appended to it; with prefix=False
+        it is matched through make_term_query() instead.
+        `pdcounter` is passed on unchanged to search_tags().
          """
          """
-        terms = []
-
-        while True:
-            t = enum.term()
-            if t:
-                terms.append(t)
-            if not enum.next(): break
+        q = self.index.Q()
+        query = query.strip()
+        for field in ['tag_name', 'tag_name_pl']:
+            if prefix:
+                q |= self.index.Q(**{field: query + "*"})
+            else:
+                q |= self.make_term_query(query, field=field)
+        # tags of category "book" are left out of the hints
+        qu = self.index.query(q).exclude(tag_category="book")
  
  
-        if terms:
-            return JArray('object')(terms, Term)
+        return self.search_tags(qu, pdcounter=pdcounter)
  
  
-    def search_tags(self, query, filter=None, max_results=40):
+    def search_tags(self, query, filters=None, pdcounter=False):
          """
          Search for Tag objects using query.
          """
          """
          Search for Tag objects using query.
          """
-        tops = self.searcher.search(query, filter, max_results)
+        # Returns catalogue Tag objects for the hits, followed by those pdcounter
+        # objects whose slug is not already used by one of the catalogue tags.
+        # Unless `pdcounter` is true, documents flagged is_pdcounter are filtered out.
+        # Hits whose database row no longer exists are skipped.
+        # work on a copy so the caller's filter list is never modified
+        filters = list(filters) if filters else []
+        if not pdcounter:
+            filters.append(~self.index.Q(is_pdcounter=True))
+        res = self.apply_filters(query, filters).execute()
  
          tags = []
  
          tags = []
-        for found in tops.scoreDocs:
-            doc = self.searcher.doc(found.doc)
-            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
-            tags.append(tag)
-            print "%s (%d) -> %f" % (tag, tag.id, found.score)
+        pd_tags = []
  
  
-        return tags
-
-    def search_books(self, query, filter=None, max_results=10):
-        """
-        Searches for Book objects using query
-        """
-        bks = []
-        tops = self.searcher.search(query, filter, max_results)
-        for found in tops.scoreDocs:
-            doc = self.searcher.doc(found.doc)
-            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
-        return bks
-
-    def create_prefix_phrase(self, toks, field):
-        q = MultiPhraseQuery()
-        for i in range(len(toks)):
-            t = Term(field, toks[i])
-            if i == len(toks) - 1:
-                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
-                if pterms:
-                    q.add(pterms)
+        for doc in res:
+            is_pdcounter = doc.get('is_pdcounter', False)
+            category = doc.get('tag_category')
+            try:
+                if is_pdcounter == True:
+                    if category == 'pd_author':
+                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
+                    elif category == 'pd_book':
+                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
+                        tag.category = 'pd_book'  # make it look more like a tag.
+                    else:
+                        log.warning("Warning. cannot get pdcounter tag_id=%d from db; cat=%s", int(doc.get('tag_id')), category)
+                        # unknown category: nothing was loaded for this hit, so do not
+                        # append a stale (or unbound) `tag` left over from an earlier hit
+                        continue
+                    pd_tags.append(tag)
                  else:
                  else:
-                    q.add(t)
-            else:
-                q.add(t)
-        return q
+                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
+                    tags.append(tag)
  
  
-    @staticmethod
-    def term_filter(term, inverse=False):
-        only_term = TermsFilter()
-        only_term.addTerm(term)
-
-        if inverse:
-            neg = BooleanFilter()
-            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
-            only_term = neg
-
-        return only_term
-
-    def hint_tags(self, string, max_results=50):
-        """
-        Return auto-complete hints for tags
-        using prefix search.
-        """
-        toks = self.get_tokens(string, field='SIMPLE')
-        top = BooleanQuery()
+            # the index may still list objects that are gone from the database
+            except catalogue.models.Tag.DoesNotExist: pass
+            except PDCounterAuthor.DoesNotExist: pass
+            except PDCounterBook.DoesNotExist: pass
  
  
-        for field in ['tag_name', 'tag_name_pl']:
-            q = self.create_prefix_phrase(toks, field)
-            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
+        # catalogue tags win over pdcounter entries that share their slug
+        tags_slugs = set(t.slug for t in tags)
+        tags = tags + [t for t in pd_tags if t.slug not in tags_slugs]
  
  
-        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
+        log.debug('search_tags: %s' % tags)
  
  
-        return self.search_tags(top, no_book_cat, max_results=max_results)
+        return tags
  
  
-    def hint_books(self, string, max_results=50):
+    def hint_books(self, query, prefix=True):
          """
          Returns auto-complete hints for book titles
          Because we do not index 'pseudo' title-tags.
          Prefix search.
          """
          """
          Returns auto-complete hints for book titles
          Because we do not index 'pseudo' title-tags.
          Prefix search.
          """
-        toks = self.get_tokens(string, field='SIMPLE')
-
-        q = self.create_prefix_phrase(toks, 'title')
-
-        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
+        # prefix=True looks the stripped text up in 'title' with a trailing "*";
+        # prefix=False matches it through make_term_query() on 'title' instead.
+        q = self.index.Q()
+        query = query.strip()
+        if prefix:
+            q |= self.index.Q(title=query + "*")
+        else:
+            q |= self.make_term_query(query, field='title')
+        qu = self.index.query(q)
+        # passed to search_books() as a filter on documents flagged is_book
+        only_books = self.index.Q(is_book=True)
+        return self.search_books(qu, [only_books])
  
  
-    @staticmethod
-    def chain_filters(filters, op=ChainedFilter.AND):
+    def search_books(self, query, filters=None, max_results=10):
          """
          """
-        Chains a filter list together
+        Searches for Book objects using query
+
+        Every book is returned once, in the order its first hit appears.
+        At most `max_results` books are returned (None means no cap).
+        Hits whose Book row no longer exists are skipped.
          """
          """
-        filters = filter(lambda x: x is not None, filters)
-        if not filters or filters is []:
-            return None
-        chf = ChainedFilter(JArray('object')(filters, Filter), op)
-        return chf
+        bks = []
+        bks_found = set()
+        query = query.query(is_book=True)
+        res = self.apply_filters(query, filters).field_limit(['book_id'])
+        for r in res:
+            bid = r['book_id']
+            if bid in bks_found:
+                continue
+            try:
+                bks.append(catalogue.models.Book.objects.get(id=bid))
+            except catalogue.models.Book.DoesNotExist:
+                # the index still lists a book that is gone from the database
+                continue
+            bks_found.add(bid)
+            # honour the max_results argument, which used to be ignored
+            if max_results is not None and len(bks) >= max_results:
+                break
+        return bks
+ 
  
  
-    def filtered_categories(self, tags):
+    @staticmethod
+    def apply_filters(query, filters):
          """
          """
-        Return a list of tag categories, present in tags list.
+        Narrow `query` by every filter in `filters` and return the result.
+        `filters` may be None; None entries in it are ignored.
          """
          """
-        cats = {}
-        for t in tags:
-            cats[t.category] = True
-        return cats.keys()
-
-    def hint(self):
-        return Hint(self)
+        for flt in ([] if filters is None else filters):
+            if flt is None:
+                continue
+            query = query.query(flt)
+        return query