Django 1.6 and some cleaning.
[wolnelektury.git] / apps / search / index.py
index 6068fa2..7fb60b5 100644 (file)
@@ -1,25 +1,9 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
-
+# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
 from django.conf import settings
 from django.conf import settings
-from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
-    File, Field, Integer, \
-    NumericField, Version, Document, JavaError, IndexSearcher, \
-    QueryParser, PerFieldAnalyzerWrapper, \
-    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
-    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
-    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
-    HashSet, BooleanClause, Term, CharTermAttribute, \
-    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
-    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, Integer, \
-    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
-    BooleanFilter, TermsFilter, FilterClause, QueryWrapperFilter, \
-    initVM, CLASSPATH, JArray, JavaError
-    # KeywordAnalyzer
-
-# Initialize jvm
-JVM = initVM(CLASSPATH)
-
-import sys
+
 import os
 import re
 import errno
 import os
 import re
 import errno
@@ -27,79 +11,20 @@ from librarian import dcparser
 from librarian.parser import WLDocument
 from lxml import etree
 import catalogue.models
 from librarian.parser import WLDocument
 from lxml import etree
 import catalogue.models
-from multiprocessing.pool import ThreadPool
-from threading import current_thread
-import atexit
+from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
+from itertools import chain
 import traceback
 import traceback
+import logging
+log = logging.getLogger('search')
+import sunburnt
+import custom
+import operator
 
 
+log = logging.getLogger('search')
 
 
-class WLAnalyzer(PerFieldAnalyzerWrapper):
-    def __init__(self):
-        polish = PolishAnalyzer(Version.LUCENE_34)
-        #        polish_gap.setPositionIncrementGap(999)
-
-        simple = SimpleAnalyzer(Version.LUCENE_34)
-        #        simple_gap.setPositionIncrementGap(999)
-
-        keyword = KeywordAnalyzer(Version.LUCENE_34)
-
-        # not sure if needed: there's NOT_ANALYZED meaning basically the same
-
-        PerFieldAnalyzerWrapper.__init__(self, polish)
-
-        self.addAnalyzer("tags", simple)
-        self.addAnalyzer("technical_editors", simple)
-        self.addAnalyzer("editors", simple)
-        self.addAnalyzer("url", keyword)
-        self.addAnalyzer("source_url", keyword)
-        self.addAnalyzer("source_name", simple)
-        self.addAnalyzer("publisher", simple)
-        self.addAnalyzer("authors", simple)
-        self.addAnalyzer("title", simple)
-
-        self.addAnalyzer("is_book", keyword)
-        # shouldn't the title have two forms? _pl and simple?
-
-        self.addAnalyzer("themes", simple)
-        self.addAnalyzer("themes_pl", polish)
-
-        self.addAnalyzer("tag_name", simple)
-        self.addAnalyzer("tag_name_pl", polish)
-
-        self.addAnalyzer("translators", simple)
-
-        self.addAnalyzer("KEYWORD", keyword)
-        self.addAnalyzer("SIMPLE", simple)
-        self.addAnalyzer("POLISH", polish)
-
-
-class IndexStore(object):
-    """
-    Provides access to search index.
-
-    self.store - lucene index directory
-    """
-    def __init__(self):
-        self.make_index_dir()
-        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
-
-    def make_index_dir(self):
-        try:
-            os.makedirs(settings.SEARCH_INDEX)
-        except OSError as exc:
-            if exc.errno == errno.EEXIST:
-                pass
-            else: raise
-
-
-class IndexChecker(IndexStore):
-    def __init__(self):
-        IndexStore.__init__(self)
-
-    def check(self):
-        checker = CheckIndex(self.store)
-        status = checker.checkIndex()
-        return status
+class SolrIndex(object):
+    def __init__(self, mode=None):
+        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
 
 
 class Snippets(object):
 
 
 class Snippets(object):
@@ -110,7 +35,7 @@ class Snippets(object):
     """
     SNIPPET_DIR = "snippets"
 
     """
     SNIPPET_DIR = "snippets"
 
-    def __init__(self, book_id):
+    def __init__(self, book_id, revision=None):
         try:
             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
         except OSError as exc:
         try:
             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
         except OSError as exc:
@@ -118,15 +43,32 @@ class Snippets(object):
                 pass
             else: raise
         self.book_id = book_id
                 pass
             else: raise
         self.book_id = book_id
+        self.revision = revision
         self.file = None
 
         self.file = None
 
+    @property
+    def path(self):
+        if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
+        else: fn = "%d" % self.book_id
+
+        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
+
     def open(self, mode='r'):
         """
         Open the snippet file. Call .close() afterwards.
         """
         if not 'b' in mode:
             mode += 'b'
     def open(self, mode='r'):
         """
         Open the snippet file. Call .close() afterwards.
         """
         if not 'b' in mode:
             mode += 'b'
-        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
+
+        if 'w' in mode:
+            if os.path.exists(self.path):
+                self.revision = 1
+                while True:
+                    if not os.path.exists(self.path):
+                        break
+                    self.revision += 1
+
+        self.file = open(self.path, mode)
         self.position = 0
         return self
 
         self.position = 0
         return self
 
@@ -153,87 +95,142 @@ class Snippets(object):
 
     def close(self):
         """Close snippet file"""
 
     def close(self):
         """Close snippet file"""
-        self.file.close()
-
-
-class BaseIndex(IndexStore):
-    """
-    Base index class.
-    Provides basic operations on index: opening, closing, optimizing.
-    """
-    def __init__(self, analyzer=None):
-        super(BaseIndex, self).__init__()
-        self.index = None
-        if not analyzer:
-            analyzer = WLAnalyzer()
-        self.analyzer = analyzer
-
-    def open(self, analyzer=None):
-        if self.index:
-            raise Exception("Index is already opened")
-        self.index = IndexWriter(self.store, self.analyzer,\
-                                 IndexWriter.MaxFieldLength.LIMITED)
-        return self.index
-
-    def optimize(self):
-        self.index.optimize()
+        if self.file:
+            self.file.close()
 
 
-    def close(self):
+    def remove(self):
+        self.revision = None
         try:
         try:
-            self.index.optimize()
-        except JavaError, je:
-            print "Error during optimize phase, check index: %s" % je
-
-        self.index.close()
-        self.index = None
-
-    def __enter__(self):
-        self.open()
-        return self
-
-    def __exit__(self, type, value, tb):
-        self.close()
+            os.unlink(self.path)
+            self.revision = 0
+            while True:
+                self.revision += 1
+                os.unlink(self.path)
+        except OSError:
+            pass
 
 
 
 
-class Index(BaseIndex):
+class Index(SolrIndex):
     """
     Class indexing books.
     """
     """
     Class indexing books.
     """
-    def __init__(self, analyzer=None):
-        super(Index, self).__init__(analyzer)
+    def __init__(self):
+        super(Index, self).__init__(mode='rw')
+
+    def delete_query(self, *queries):
+        """
+        index.delete(queries=...) doesn't work, so let's reimplement it
+        using deletion of list of uids.
+        """
+        uids = set()
+        for q in queries:
+            if isinstance(q, sunburnt.search.LuceneQuery):
+                q = self.index.query(q)
+            q.field_limiter.update(['uid'])
+            st = 0
+            rows = 100
+            while True:
+                ids = q.paginate(start=st, rows=rows).execute()
+                if not len(ids):
+                    break
+                for res in ids:
+                    uids.add(res['uid'])
+                st += rows
+        if uids:
+            self.index.delete(uids)
+            return True
+        else:
+            return False
 
 
-    def index_tags(self):
+    def index_tags(self, *tags, **kw):
         """
         Re-index global tag list.
         Removes all tags from index, then index them again.
         Indexed fields include: id, name (with and without polish stems), category
         """
         """
         Re-index global tag list.
         Removes all tags from index, then index them again.
         Indexed fields include: id, name (with and without polish stems), category
         """
-        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
-        self.index.deleteDocuments(q)
-
-        for tag in catalogue.models.Tag.objects.all():
-            doc = Document()
-            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
-            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
-            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
-            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
-            self.index.addDocument(doc)
+        log.debug("Indexing tags")
+        remove_only = kw.get('remove_only', False)
+        # first, remove tags from index.
+        if tags:
+            tag_qs = []
+            for tag in tags:
+                q_id = self.index.Q(tag_id=tag.id)
+
+                if isinstance(tag, PDCounterAuthor):
+                    q_cat = self.index.Q(tag_category='pd_author')
+                elif isinstance(tag, PDCounterBook):
+                    q_cat = self.index.Q(tag_category='pd_book')
+                else:
+                    q_cat = self.index.Q(tag_category=tag.category)
+
+                q_id_cat = self.index.Q(q_id & q_cat)
+                tag_qs.append(q_id_cat)
+            self.delete_query(*tag_qs)
+        else:  # all
+            q = self.index.Q(tag_id__any=True)
+            self.delete_query(q)
+
+        if not remove_only:
+            # then add them [all or just one passed]
+            if not tags:
+                tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
+                    PDCounterAuthor.objects.all(), \
+                    PDCounterBook.objects.all())
+
+            for tag in tags:
+                if isinstance(tag, PDCounterAuthor):
+                    doc = {
+                        "tag_id": int(tag.id),
+                        "tag_name": tag.name,
+                        "tag_name_pl": tag.name,
+                        "tag_category": 'pd_author',
+                        "is_pdcounter": True,
+                        "uid": "tag%d_pd_a" % tag.id
+                        }
+                elif isinstance(tag, PDCounterBook):
+                    doc = {
+                        "tag_id": int(tag.id),
+                        "tag_name": tag.title,
+                        "tag_name_pl": tag.title,
+                        "tag_category": 'pd_book',
+                        "is_pdcounter": True,
+                        "uid": "tag%d_pd_b" % tag.id
+                        }
+                else:
+                    doc = {
+                        "tag_id": int(tag.id),
+                        "tag_name": tag.name,
+                        "tag_name_pl": tag.name,
+                        "tag_category": tag.category,
+                        "is_pdcounter": False,
+                        "uid": "tag%d" % tag.id
+                        }
+                self.index.add(doc)
 
     def create_book_doc(self, book):
         """
         Create a lucene document referring book id.
         """
 
     def create_book_doc(self, book):
         """
         Create a lucene document referring book id.
         """
-        doc = Document()
-        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
+        doc = {
+            'book_id': int(book.id),
+            }
         if book.parent is not None:
         if book.parent is not None:
-            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
+            doc["parent_id"] = int(book.parent.id)
         return doc
 
         return doc
 
-    def remove_book(self, book):
+    def remove_book(self, book_or_id, remove_snippets=True):
         """Removes a book from search index.
         book - Book instance."""
         """Removes a book from search index.
         book - Book instance."""
-        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
-        self.index.deleteDocuments(q)
+        if isinstance(book_or_id, catalogue.models.Book):
+            book_id = book_or_id.id
+        else:
+            book_id = book_or_id
+
+        self.delete_query(self.index.Q(book_id=book_id))
+
+        if remove_snippets:
+            snippets = Snippets(book_id)
+            snippets.remove()
 
     def index_book(self, book, book_info=None, overwrite=True):
         """
 
     def index_book(self, book, book_info=None, overwrite=True):
         """
@@ -242,21 +239,32 @@ class Index(BaseIndex):
         and calls self.index_content() to index the contents of the book.
         """
         if overwrite:
         and calls self.index_content() to index the contents of the book.
         """
         if overwrite:
-            self.remove_book(book)
+            # we don't remove snippets, since they might be still needed by
+            # threads using not reopened index
+            self.remove_book(book, remove_snippets=False)
 
         book_doc = self.create_book_doc(book)
 
         book_doc = self.create_book_doc(book)
-        meta_fields = self.extract_metadata(book, book_info)
-        for f in meta_fields.values():
-            if isinstance(f, list) or isinstance(f, tuple):
-                for elem in f:
-                    book_doc.add(elem)
-            else:
-                book_doc.add(f)
+        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
+        # let's not index it - it's only used for extracting publish date
+        if 'source_name' in meta_fields:
+            del meta_fields['source_name']
 
 
-        self.index.addDocument(book_doc)
+        for n, f in meta_fields.items():
+            book_doc[n] = f
+
+        book_doc['uid'] = "book%s" % book_doc['book_id']
+        self.index.add(book_doc)
         del book_doc
         del book_doc
+        book_fields = {
+            'title': meta_fields['title'],
+            'authors': meta_fields['authors'],
+            'published_date': meta_fields['published_date']
+            }
+
+        if 'translators' in meta_fields:
+            book_fields['translators'] = meta_fields['translators']
 
 
-        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
+        self.index_content(book, book_fields=book_fields)
 
     master_tags = [
         'opowiadanie',
 
     master_tags = [
         'opowiadanie',
@@ -280,7 +288,7 @@ class Index(BaseIndex):
 
     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 
 
     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 
-    def extract_metadata(self, book, book_info=None):
+    def extract_metadata(self, book, book_info=None, dc_only=None):
         """
         Extract metadata from book and returns a map of fields keyed by fieldname
         """
         """
         Extract metadata from book and returns a map of fields keyed by fieldname
         """
@@ -289,12 +297,14 @@ class Index(BaseIndex):
         if book_info is None:
             book_info = dcparser.parse(open(book.xml_file.path))
 
         if book_info is None:
             book_info = dcparser.parse(open(book.xml_file.path))
 
-        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
-        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
-        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
+        fields['slug'] = book.slug
+        fields['tags'] = [t.name  for t in book.tags]
+        fields['is_book'] = True
 
         # validator, name
         for field in dcparser.BookInfo.FIELDS:
 
         # validator, name
         for field in dcparser.BookInfo.FIELDS:
+            if dc_only and field.name not in dc_only:
+                continue
             if hasattr(book_info, field.name):
                 if not getattr(book_info, field.name):
                     continue
             if hasattr(book_info, field.name):
                 if not getattr(book_info, field.name):
                     continue
@@ -304,39 +314,38 @@ class Index(BaseIndex):
                     s = getattr(book_info, field.name)
                     if field.multiple:
                         s = ', '.join(s)
                     s = getattr(book_info, field.name)
                     if field.multiple:
                         s = ', '.join(s)
-                    try:
-                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
-                    except JavaError as je:
-                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
+                    fields[field.name] = s
                 elif type_indicator == dcparser.as_person:
                     p = getattr(book_info, field.name)
                     if isinstance(p, dcparser.Person):
                         persons = unicode(p)
                     else:
                         persons = ', '.join(map(unicode, p))
                 elif type_indicator == dcparser.as_person:
                     p = getattr(book_info, field.name)
                     if isinstance(p, dcparser.Person):
                         persons = unicode(p)
                     else:
                         persons = ', '.join(map(unicode, p))
-                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
+                    fields[field.name] = persons
                 elif type_indicator == dcparser.as_date:
                     dt = getattr(book_info, field.name)
                 elif type_indicator == dcparser.as_date:
                     dt = getattr(book_info, field.name)
-                    fields[field.name] = Field(field.name, "%04d%02d%02d" %\
-                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
+                    fields[field.name] = dt
 
         # get published date
 
         # get published date
-        source = book_info.source_name
-        match = self.published_date_re.search(source)
-        if match is not None:
-            fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
+        pd = None
+        if hasattr(book_info, 'source_name') and book_info.source_name:
+            match = self.published_date_re.search(book_info.source_name)
+            if match is not None:
+                pd = str(match.groups()[0])
+        if not pd: pd = ""
+        fields["published_date"] = pd
 
         return fields
 
 
         return fields
 
-    def add_gaps(self, fields, fieldname):
-        """
-        Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
-        This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
-        """
-        def gap():
-            while True:
-                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
-        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
+    def add_gaps(self, fields, fieldname):
+        """
+        Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
+        This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
+        """
+        def gap():
+            while True:
+                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
+        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 
     def get_master(self, root):
         """
 
     def get_master(self, root):
         """
@@ -346,7 +355,7 @@ class Index(BaseIndex):
             if master.tag in self.master_tags:
                 return master
 
             if master.tag in self.master_tags:
                 return master
 
-    def index_content(self, book, book_fields=[]):
+    def index_content(self, book, book_fields={}):
         """
         Walks the book XML and extract content from it.
         Adds parts for each header tag and for each fragment.
         """
         Walks the book XML and extract content from it.
         Adds parts for each header tag and for each fragment.
@@ -389,39 +398,31 @@ class Index(BaseIndex):
 
         def add_part(snippets, **fields):
             doc = self.create_book_doc(book)
 
         def add_part(snippets, **fields):
             doc = self.create_book_doc(book)
-            for f in book_fields:
-                doc.add(f)
+            for n, v in book_fields.items():
+                doc[n] = v
 
 
-            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
-            doc.add(NumericField("header_span", Field.Store.YES, True)\
-                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
-            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
+            doc['header_index'] = fields["header_index"]
+            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
+            doc['header_type'] = fields['header_type']
 
 
-            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
-                          Field.TermVector.WITH_POSITIONS_OFFSETS))
+            doc['text'] = fields['text']
 
 
-            snip_pos = snippets.add(fields["content"])
-            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
-            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
+            # snippets
+            snip_pos = snippets.add(fields["text"])
+
+            doc['snippets_position'] = snip_pos[0]
+            doc['snippets_length'] = snip_pos[1]
+            if snippets.revision:
+                doc["snippets_revision"] = snippets.revision
 
             if 'fragment_anchor' in fields:
 
             if 'fragment_anchor' in fields:
-                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
-                              Field.Store.YES, Field.Index.NOT_ANALYZED))
+                doc["fragment_anchor"] = fields['fragment_anchor']
 
             if 'themes' in fields:
 
             if 'themes' in fields:
-                themes, themes_pl = zip(*[
-                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
-                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
-                     for theme in fields['themes']])
-
-                themes = self.add_gaps(themes, 'themes')
-                themes_pl = self.add_gaps(themes_pl, 'themes_pl')
-
-                for t in themes:
-                    doc.add(t)
-                for t in themes_pl:
-                    doc.add(t)
-
+                doc['themes'] = fields['themes']
+            doc['uid'] = "part%s%s%s" % (doc['header_index'],
+                                         doc['header_span'],
+                                         doc.get('fragment_anchor', ''))
             return doc
 
         def give_me_utf8(s):
             return doc
 
         def give_me_utf8(s):
@@ -446,39 +447,38 @@ class Index(BaseIndex):
 
                 def all_content(text):
                     for frag in fragments.values():
 
                 def all_content(text):
                     for frag in fragments.values():
-                        frag['content'].append(text)
+                        frag['text'].append(text)
                     content.append(text)
                 handle_text = [all_content]
 
                     content.append(text)
                 handle_text = [all_content]
 
-
                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                     # handle footnotes
                     if start is not None and start.tag in self.footnote_tags:
                         footnote = []
                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                     # handle footnotes
                     if start is not None and start.tag in self.footnote_tags:
                         footnote = []
+
                         def collect_footnote(t):
                             footnote.append(t)
                         def collect_footnote(t):
                             footnote.append(t)
+
                         handle_text.append(collect_footnote)
                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                         handle_text.pop()
                         doc = add_part(snippets, header_index=position, header_type=header.tag,
                         handle_text.append(collect_footnote)
                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                         handle_text.pop()
                         doc = add_part(snippets, header_index=position, header_type=header.tag,
-                                       content=u''.join(footnote),
-                                       is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
-                
-                        self.index.addDocument(doc)
-                        print "@ footnote text: %s" % footnote
+                                       text=u''.join(footnote),
+                                       is_footnote=True)
+                        self.index.add(doc)
                         footnote = []
                         footnote = []
-                    
+
                     # handle fragments and themes.
                     if start is not None and start.tag == 'begin':
                         fid = start.attrib['id'][1:]
                     # handle fragments and themes.
                     if start is not None and start.tag == 'begin':
                         fid = start.attrib['id'][1:]
-                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
+                        fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 
                     # themes for this fragment
                     elif start is not None and start.tag == 'motyw':
                         fid = start.attrib['id'][1:]
                         handle_text.append(None)
                         if start.text is not None:
 
                     # themes for this fragment
                     elif start is not None and start.tag == 'motyw':
                         fid = start.attrib['id'][1:]
                         handle_text.append(None)
                         if start.text is not None:
-                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
+                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                     elif end is not None and end.tag == 'motyw':
                         handle_text.pop()
 
                     elif end is not None and end.tag == 'motyw':
                         handle_text.pop()
 
@@ -496,10 +496,9 @@ class Index(BaseIndex):
                                        header_index=frag['start_section'],
                                        header_span=position - frag['start_section'] + 1,
                                        fragment_anchor=fid,
                                        header_index=frag['start_section'],
                                        header_span=position - frag['start_section'] + 1,
                                        fragment_anchor=fid,
-                                       content=fix_format(frag['content']),
+                                       text=fix_format(frag['text']),
                                        themes=frag['themes'])
                                        themes=frag['themes'])
-                        print '@ FRAG %s' % frag['content']
-                        self.index.addDocument(doc)
+                        self.index.add(doc)
 
                         # Collect content.
 
 
                         # Collect content.
 
@@ -509,145 +508,63 @@ class Index(BaseIndex):
                             hdl(text)
 
                         # in the end, add a section text.
                             hdl(text)
 
                         # in the end, add a section text.
-                doc = add_part(snippets, header_index=position, header_type=header.tag,
-                               content=fix_format(content))
-                print '@ CONTENT: %s' % fix_format(content)
+                doc = add_part(snippets, header_index=position,
+                               header_type=header.tag, text=fix_format(content))
 
 
-                self.index.addDocument(doc)
+                self.index.add(doc)
 
         finally:
             snippets.close()
 
 
 
         finally:
             snippets.close()
 
 
-def log_exception_wrapper(f):
-    def _wrap(*a):
-        try:
-            f(*a)
-        except Exception, e:
-            print("Error in indexing thread: %s" % e)
-            traceback.print_exc()
-            raise e
-    return _wrap
-
-
-class ReusableIndex(Index):
-    """
-    Works like index, but does not close/optimize Lucene index
-    until program exit (uses atexit hook).
-    This is usefull for importbooks command.
-
-    if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
-    """
-    index = None
-
-    def open(self, analyzer=None, threads=4):
-        if ReusableIndex.index is not None:
-            self.index = ReusableIndex.index
-        else:
-            print("opening index")
-            Index.open(self, analyzer)
-            ReusableIndex.index = self.index
-            atexit.register(ReusableIndex.close_reusable)
-
-    # def index_book(self, *args, **kw):
-    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
-    #     ReusableIndex.pool_jobs.append(job)
-
-    @staticmethod
-    def close_reusable():
-        if ReusableIndex.index is not None:
-            ReusableIndex.index.optimize()
-            ReusableIndex.index.close()
-            ReusableIndex.index = None
-
-    def close(self):
-        pass
-
-
-class JoinSearch(object):
-    """
-    This mixin could be used to handle block join queries.
-    (currently unused)
-    """
-    def __init__(self, *args, **kw):
-        super(JoinSearch, self).__init__(*args, **kw)
-
-    def wrapjoins(self, query, fields=[]):
-        """
-        This functions modifies the query in a recursive way,
-        so Term and Phrase Queries contained, which match
-        provided fields are wrapped in a BlockJoinQuery,
-        and so delegated to children documents.
-        """
-        if BooleanQuery.instance_(query):
-            qs = BooleanQuery.cast_(query)
-            for clause in qs:
-                clause = BooleanClause.cast_(clause)
-                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
-            return qs
-        else:
-            termset = HashSet()
-            query.extractTerms(termset)
-            for t in termset:
-                t = Term.cast_(t)
-                if t.field() not in fields:
-                    return query
-            return BlockJoinQuery(query, self.parent_filter,
-                                  BlockJoinQuery.ScoreMode.Total)
-
-    def bsearch(self, query, max_results=50):
-        q = self.query(query)
-        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
-
-        tops = self.searcher.search(bjq, max_results)
-        bks = []
-        for found in tops.scoreDocs:
-            doc = self.searcher.doc(found.doc)
-            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
-        return (bks, tops.totalHits)
-
-
 class SearchResult(object):
 class SearchResult(object):
-    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
-        if tokens_cache is None: tokens_cache = {}
-
-        if score:
-            self._score = score
-        else:
-            self._score = scoreDocs.score
-
+    def __init__(self, doc, how_found=None, query=None, query_terms=None):
+        #        self.search = search
         self.boost = 1.0
         self.boost = 1.0
-
         self._hits = []
         self._processed_hits = None  # processed hits
         self._hits = []
         self._processed_hits = None  # processed hits
+        self.snippets = []
+        self.query_terms = query_terms
 
 
-        stored = search.searcher.doc(scoreDocs.doc)
-        self.book_id = int(stored.get("book_id"))
-
-        header_type = stored.get("header_type")
-        if not header_type:
-            return
-
-        sec = (header_type, int(stored.get("header_index")))
-        header_span = stored.get('header_span')
-        header_span = header_span is not None and int(header_span) or 1
-
-        fragment = stored.get("fragment_anchor")
+        if 'score' in doc:
+            self._score = doc['score']
+        else:
+            self._score = 0
 
 
-        pd = stored.get("published_date")
-        if pd is None:
-            pd = 0
-        self.published_date = int(pd)
+        self.book_id = int(doc["book_id"])
 
 
-        if snippets:
-            snippets = snippets.replace("/\n", "\n")
-        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
+        try:
+            self.published_date = int(doc.get("published_date"))
+        except ValueError:
+            self.published_date = 0
+
+        # content hits
+        header_type = doc.get("header_type", None)
+        # we have a content hit in some header of fragment
+        if header_type is not None:
+            sec = (header_type, int(doc["header_index"]))
+            header_span = doc['header_span']
+            header_span = header_span is not None and int(header_span) or 1
+            fragment = doc.get("fragment_anchor", None)
+            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
+            snippets_rev = doc.get('snippets_revision', None)
+
+            hit = (sec + (header_span,), fragment, self._score, {
+                'how_found': how_found,
+                'snippets_pos': snippets_pos,
+                'snippets_revision': snippets_rev,
+                'themes': doc.get('themes', []),
+                'themes_pl': doc.get('themes_pl', [])
+                })
+
+            self._hits.append(hit)
 
 
-        self._hits.append(hit)
+    def __unicode__(self):
+        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
+            (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
 
 
-        self.search = search
-        self.searched = searched
-        self.tokens_cache = tokens_cache
+    def __str__(self):
+        return unicode(self).encode('utf-8')
 
     @property
     def score(self):
 
     @property
     def score(self):
@@ -662,87 +579,104 @@ class SearchResult(object):
         return self
 
     def get_book(self):
         return self
 
     def get_book(self):
-        return catalogue.models.Book.objects.get(id=self.book_id)
+        if hasattr(self, '_book'):
+            return self._book
+        self._book = catalogue.models.Book.objects.get(id=self.book_id)
+        return self._book
 
     book = property(get_book)
 
 
     book = property(get_book)
 
+    POSITION = 0
+    FRAGMENT = 1
+    POSITION_INDEX = 1
+    POSITION_SPAN = 2
+    SCORE = 2
+    OTHER = 3
+
     @property
     def hits(self):
         if self._processed_hits is not None:
             return self._processed_hits
 
     @property
     def hits(self):
         if self._processed_hits is not None:
             return self._processed_hits
 
-        POSITION = 0
-        FRAGMENT = 1
-        POSITION_INDEX = 1
-        POSITION_SPAN = 2
-        SCORE = 2
-        OTHER = 3
-
         # to sections and fragments
         # to sections and fragments
-        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
-        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
+        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
+
+        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
+
+        # sections not covered by fragments
         sect = filter(lambda s: 0 == len(filter(
         sect = filter(lambda s: 0 == len(filter(
-            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
-            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
+            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
+            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
             frags)), sect)
 
         hits = []
 
             frags)), sect)
 
         hits = []
 
-        # remove duplicate fragments
-        fragments = {}
-        for f in frags:
-            fid = f[FRAGMENT]
-            if fid in fragments:
-                if fragments[fid][SCORE] >= f[SCORE]:
-                    continue
-            fragments[fid] = f
-        frags = fragments.values()
+        def remove_duplicates(lst, keyfn, compare):
+            els = {}
+            for e in lst:
+                eif = keyfn(e)
+                if eif in els:
+                    if compare(els[eif], e) >= 1:
+                        continue
+                els[eif] = e
+            return els.values()
+
+        # remove fragments with duplicated fid's and duplicated snippets
+        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
+        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
+        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
 
         # remove duplicate sections
         sections = {}
 
         for s in sect:
 
         # remove duplicate sections
         sections = {}
 
         for s in sect:
-            si = s[POSITION][POSITION_INDEX]
+            si = s[self.POSITION][self.POSITION_INDEX]
             # skip existing
             if si in sections:
             # skip existing
             if si in sections:
-                if sections[si]['score'] >= s[SCORE]:
+                if sections[si]['score'] >= s[self.SCORE]:
                     continue
 
                     continue
 
-            m = {'score': s[SCORE],
-                 'section_number': s[POSITION][POSITION_INDEX] + 1,
+            m = {'score': s[self.SCORE],
+                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                  }
                  }
-            m.update(s[OTHER])
+            m.update(s[self.OTHER])
             sections[si] = m
 
         hits = sections.values()
 
         for f in frags:
             try:
             sections[si] = m
 
         hits = sections.values()
 
         for f in frags:
             try:
-                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
+                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
             except catalogue.models.Fragment.DoesNotExist:
                 # stale index
                 continue
             except catalogue.models.Fragment.DoesNotExist:
                 # stale index
                 continue
-
             # Figure out if we were searching for a token matching some word in theme name.
             themes = frag.tags.filter(category='theme')
             # Figure out if we were searching for a token matching some word in theme name.
             themes = frag.tags.filter(category='theme')
-            themes_hit = []
-            if self.searched is not None:
-                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
-                for theme in themes:
-                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
-                    for t in tokens:
-                        if t in name_tokens:
-                            if not theme in themes_hit:
-                                themes_hit.append(theme)
+            themes_hit = set()
+            if self.query_terms is not None:
+                for i in range(0, len(f[self.OTHER]['themes'])):
+                    tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
+                    tms = map(unicode.lower, tms)
+                    for qt in self.query_terms:
+                        if qt in tms:
+                            themes_hit.add(f[self.OTHER]['themes'][i])
                             break
 
                             break
 
-            m = {'score': f[SCORE],
+            def theme_by_name(n):
+                th = filter(lambda t: t.name == n, themes)
+                if th:
+                    return th[0]
+                else:
+                    return None
+            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
+
+            m = {'score': f[self.SCORE],
                  'fragment': frag,
                  'fragment': frag,
-                 'section_number': f[POSITION][POSITION_INDEX] + 1,
+                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                  'themes': themes,
                  'themes_hit': themes_hit
                  }
                  'themes': themes,
                  'themes_hit': themes_hit
                  }
-            m.update(f[OTHER])
+            m.update(f[self.OTHER])
             hits.append(m)
 
         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
             hits.append(m)
 
         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
@@ -751,9 +685,6 @@ class SearchResult(object):
 
         return hits
 
 
         return hits
 
-    def __unicode__(self):
-        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
-
     @staticmethod
     def aggregate(*result_lists):
         books = {}
     @staticmethod
     def aggregate(*result_lists):
         books = {}
@@ -761,7 +692,6 @@ class SearchResult(object):
             for r in rl:
                 if r.book_id in books:
                     books[r.book_id].merge(r)
             for r in rl:
                 if r.book_id in books:
                     books[r.book_id].merge(r)
-                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                 else:
                     books[r.book_id] = r
         return books.values()
                 else:
                     books[r.book_id] = r
         return books.values()
@@ -774,559 +704,239 @@ class SearchResult(object):
         else:
             return c
 
         else:
             return c
 
+    def __len__(self):
+        return len(self.hits)
 
 
-class Hint(object):
-    """
-    Given some hint information (information we already know about)
-    our search target - like author, title (specific book), epoch, genre, kind
-    we can narrow down search using filters.
-    """
-    def __init__(self, search):
-        """
-        Accepts a Searcher instance.
-        """
-        self.search = search
-        self.book_tags = {}
-        self.part_tags = []
-        self._books = []
-
-    def books(self, *books):
-        """
-        Give a hint that we search these books.
-        """
-        self._books = books
-
-    def tags(self, tags):
-        """
-        Give a hint that these Tag objects (a list of)
-        is necessary.
-        """
-        for t in tags:
-            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
-                lst = self.book_tags.get(t.category, [])
-                lst.append(t)
-                self.book_tags[t.category] = lst
-            if t.category in ['theme', 'theme_pl']:
-                self.part_tags.append(t)
-
-    def tag_filter(self, tags, field='tags'):
-        """
-        Given a lsit of tags and an optional field (but they are normally in tags field)
-        returns a filter accepting only books with specific tags.
-        """
-        q = BooleanQuery()
-
-        for tag in tags:
-            toks = self.search.get_tokens(tag.name, field=field)
-            tag_phrase = PhraseQuery()
-            for tok in toks:
-                tag_phrase.add(Term(field, tok))
-            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
-
-        return QueryWrapperFilter(q)
+    def snippet_pos(self, idx=0):
+        return self.hits[idx]['snippets_pos']
 
 
-    def book_filter(self):
-        """
-        Filters using book tags (all tag kinds except a theme)
-        """
-        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
-        if tags:
-            return self.tag_filter(tags)
-        else:
+    def snippet_revision(self, idx=0):
+        try:
+            return self.hits[idx]['snippets_revision']
+        except:
             return None
 
             return None
 
-    def part_filter(self):
-        """
-        This filter can be used to look for book parts.
-        It filters on book id and/or themes.
-        """
-        fs = []
-        if self.part_tags:
-            fs.append(self.tag_filter(self.part_tags, field='themes'))
-
-        if self._books != []:
-            bf = BooleanFilter()
-            for b in self._books:
-                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
-                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
-            fs.append(bf)
-
-        return Search.chain_filters(fs)
-
-    def should_search_for_book(self):
-        return self._books == []
-
-    def just_search_in(self, all):
-        """Holds logic to figure out which indexes should be search, when we have some hinst already"""
-        some = []
-        for field in all:
-            if field == 'authors' and 'author' in self.book_tags:
-                continue
-            if field == 'title' and self._books != []:
-                continue
-            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
-                continue
-            some.append(field)
-        return some
 
 
-
-class Search(IndexStore):
+class Search(SolrIndex):
     """
     Search facilities.
     """
     """
     Search facilities.
     """
-    def __init__(self, default_field="content"):
-        IndexStore.__init__(self)
-        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
-        # self.analyzer = WLAnalyzer()
-        self.searcher = IndexSearcher(self.store, True)
-        self.parser = QueryParser(Version.LUCENE_34, default_field,
-                                  self.analyzer)
-
-        self.parent_filter = TermsFilter()
-        self.parent_filter.addTerm(Term("is_book", "true"))
-
-    def query(self, query):
-        """Parse query in default Lucene Syntax. (for humans)
-        """
-        return self.parser.parse(query)
-
-    def simple_search(self, query, max_results=50):
-        """Runs a query for books using lucene syntax. (for humans)
-        Returns (books, total_hits)
-        """
-
-        tops = self.searcher.search(self.query(query), max_results)
-        bks = []
-        for found in tops.scoreDocs:
-            doc = self.searcher.doc(found.doc)
-            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
-        return (bks, tops.totalHits)
-
-    def get_tokens(self, searched, field='content', cached=None):
-        """returns tokens analyzed by a proper (for a field) analyzer
-        argument can be: StringReader, string/unicode, or tokens. In the last case
-        they will just be returned (so we can reuse tokens, if we don't change the analyzer)
-        """
-        if cached is not None and field in cached:
-            return cached[field]
-
-        if isinstance(searched, str) or isinstance(searched, unicode):
-            searched = StringReader(searched)
-        elif isinstance(searched, list):
-            return searched
+    def __init__(self, default_field="text"):
+        super(Search, self).__init__(mode='r')
 
 
-        searched.reset()
-        tokens = self.analyzer.reusableTokenStream(field, searched)
-        toks = []
-        while tokens.incrementToken():
-            cta = tokens.getAttribute(CharTermAttribute.class_)
-            toks.append(cta.toString())
 
 
-        if cached is not None:
-            cached[field] = toks
-
-        return toks
-
-    def fuzziness(self, fuzzy):
-        """Helper method to sanitize fuzziness"""
-        if not fuzzy:
-            return None
-        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
-            return fuzzy
-        else:
-            return 0.5
-
-    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
-        """
-        Return a PhraseQuery with a series of tokens.
-        """
-        if fuzzy:
-            phrase = MultiPhraseQuery()
-            for t in tokens:
-                term = Term(field, t)
-                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
-                fuzzterms = []
-
-                while True:
-                    #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
-                    ft = fuzzterm.term()
-                    if ft:
-                        fuzzterms.append(ft)
-                    if not fuzzterm.next(): break
-                if fuzzterms:
-                    phrase.add(JArray('object')(fuzzterms, Term))
-                else:
-                    phrase.add(term)
-        else:
-            phrase = PhraseQuery()
-            phrase.setSlop(slop)
-            for t in tokens:
-                term = Term(field, t)
-                phrase.add(term)
-        return phrase
-
-    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
+    def make_term_query(self, query, field='text', modal=operator.or_):
         """
         Returns term queries joined by boolean query.
         modal - applies to boolean query
         fuzzy - should the query by fuzzy.
         """
         """
         Returns term queries joined by boolean query.
         modal - applies to boolean query
         fuzzy - should the query by fuzzy.
         """
-        q = BooleanQuery()
-        for t in tokens:
-            term = Term(field, t)
-            if fuzzy:
-                term = FuzzyQuery(term, self.fuzziness(fuzzy))
-            else:
-                term = TermQuery(term)
-            q.add(BooleanClause(term, modal))
+        if query is None: query = ''
+        q = self.index.Q()
+        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
+                        query.split(r" ")), q)
+
         return q
 
         return q
 
-    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
-                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
+    def search_phrase(self, searched, field='text', book=False,
+                      filters=None,
+                      snippets=False):
         if filters is None: filters = []
         if filters is None: filters = []
-        if tokens_cache is None: tokens_cache = {}
-
-        tokens = self.get_tokens(searched, field, cached=tokens_cache)
+        if book: filters.append(self.index.Q(is_book=True))
 
 
-        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
-        if book:
-            filters.append(self.term_filter(Term('is_book', 'true')))
-        top = self.searcher.search(query, self.chain_filters(filters), max_results)
+        q = self.index.query(**{field: searched})
+        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
+        res = q.execute()
+        return [SearchResult(found, how_found=u'search_phrase') for found in res]
 
 
-        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
-
-    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
-                    filters=None, tokens_cache=None, boost=None, snippets=True):
+    def search_some(self, searched, fields, book=True,
+                    filters=None, snippets=True, query_terms=None):
+        assert isinstance(fields, list)
         if filters is None: filters = []
         if filters is None: filters = []
-        if tokens_cache is None: tokens_cache = {}
-
-        if book:
-            filters.append(self.term_filter(Term('is_book', 'true')))
+        if book: filters.append(self.index.Q(is_book=True))
 
 
-        query = BooleanQuery()
+        query = self.index.Q()
 
         for fld in fields:
 
         for fld in fields:
-            tokens = self.get_tokens(searched, fld, cached=tokens_cache)
-
-            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
-                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
-
-        top = self.searcher.search(query, self.chain_filters(filters), max_results)
-
-        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
-                             snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
-
-    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
-        """
-        Search for perfect book matches. Just see if the query matches with some author or title,
-        taking hints into account.
-        """
-        fields_to_search = ['authors', 'title']
-        only_in = None
-        if hint:
-            if not hint.should_search_for_book():
-                return []
-            fields_to_search = hint.just_search_in(fields_to_search)
-            only_in = hint.book_filter()
-
-        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
-
-        books = []
-        for q in qrys:
-            top = self.searcher.search(q,
-                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
-                max_results)
-            for found in top.scoreDocs:
-                books.append(SearchResult(self, found, how_found="search_perfect_book"))
-        return books
-
-    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
-        fields_to_search = ['tags', 'authors', 'title']
-
-        only_in = None
-        if hint:
-            if not hint.should_search_for_book():
-                return []
-            fields_to_search = hint.just_search_in(fields_to_search)
-            only_in = hint.book_filter()
+            query = self.index.Q(query | self.make_term_query(searched, fld))
 
 
-        tokens = self.get_tokens(searched, field='SIMPLE')
-
-        q = BooleanQuery()
-
-        for fld in fields_to_search:
-            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
-                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
-
-        books = []
-        top = self.searcher.search(q,
-                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
-            max_results)
-        for found in top.scoreDocs:
-            books.append(SearchResult(self, found, how_found="search_book"))
+        query = self.index.query(query)
+        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
+        res = query.execute()
+        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
 
 
-        return books
 
 
-    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
-        """
-        Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
-        some part/fragment of the book.
-        """
-        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
-
-        flt = None
-        if hint:
-            flt = hint.part_filter()
-
-        books = []
-        for q in qrys:
-            top = self.searcher.search(q,
-                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
-                                                           flt]),
-                                       max_results)
-            for found in top.scoreDocs:
-                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
-
-        return books
-
-    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
+    def search_everywhere(self, searched, query_terms=None):
         """
         Tries to use search terms to match different fields of book (or its parts).
         E.g. one word can be an author survey, another be a part of the title, and the rest
         are some words from third chapter.
         """
         """
         Tries to use search terms to match different fields of book (or its parts).
         E.g. one word can be an author survey, another be a part of the title, and the rest
         are some words from third chapter.
         """
-        if tokens_cache is None: tokens_cache = {}
         books = []
         books = []
-        only_in = None
-
-        if hint:
-            only_in = hint.part_filter()
-
         # content only query : themes x content
         # content only query : themes x content
-        q = BooleanQuery()
-
-        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
-        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
+        q = self.make_term_query(searched, 'text')
+        q_themes = self.make_term_query(searched, 'themes_pl')
 
 
-        # only search in themes when we do not already filter by themes
-        if hint is None or hint.just_search_in(['themes']) != []:
-            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
-                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))
+        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
+        res = query.execute()
 
 
-        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
-                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
-
-        topDocs = self.searcher.search(q, only_in, max_results)
-        for found in topDocs.scoreDocs:
-            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
-            print "* %s theme x content: %s" % (searched, books[-1]._hits)
+        for found in res:
+            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
 
         # query themes/content x author/title/tags
 
         # query themes/content x author/title/tags
-        q = BooleanQuery()
-        in_content = BooleanQuery()
-        in_meta = BooleanQuery()
+        in_content = self.index.Q()
+        in_meta = self.index.Q()
 
 
-        for fld in ['themes_pl', 'content']:
-            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
+        for fld in ['themes_pl', 'text']:
+            in_content |= self.make_term_query(searched, field=fld)
 
         for fld in ['tags', 'authors', 'title']:
 
         for fld in ['tags', 'authors', 'title']:
-            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
+            in_meta |= self.make_term_query(searched, field=fld)
 
 
-        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
-        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
+        q = in_content & in_meta
+        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
 
 
-        topDocs = self.searcher.search(q, only_in, max_results)
-        for found in topDocs.scoreDocs:
-            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
-            print "* %s scatter search: %s" % (searched, books[-1]._hits)
+        for found in res:
+            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
 
         return books
 
 
         return books
 
-    # def multisearch(self, query, max_results=50):
-    #     """
-    #     Search strategy:
-    #     - (phrase) OR -> content
-    #                   -> title
-    #                   -> authors
-    #     - (keywords)  -> authors
-    #                   -> motyw
-    #                   -> tags
-    #                   -> content
-    #     """
-        # queryreader = StringReader(query)
-        # tokens = self.get_tokens(queryreader)
-
-        # top_level = BooleanQuery()
-        # Should = BooleanClause.Occur.SHOULD
-
-        # phrase_level = BooleanQuery()
-        # phrase_level.setBoost(1.3)
-
-        # p_content = self.make_phrase(tokens, joined=True)
-        # p_title = self.make_phrase(tokens, 'title')
-        # p_author = self.make_phrase(tokens, 'author')
-
-        # phrase_level.add(BooleanClause(p_content, Should))
-        # phrase_level.add(BooleanClause(p_title, Should))
-        # phrase_level.add(BooleanClause(p_author, Should))
-
-        # kw_level = BooleanQuery()
-
-        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
-        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
-        # kw_level.add(j_themes, Should)
-        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
-        # j_con = self.make_term_query(tokens, joined=True)
-        # kw_level.add(j_con, Should)
-
-        # top_level.add(BooleanClause(phrase_level, Should))
-        # top_level.add(BooleanClause(kw_level, Should))
-
-        # return None
-
-    def get_snippets(self, scoreDoc, query, field='content'):
+    def get_snippets(self, searchresult, query, field='text', num=1):
         """
         Returns a snippet for found scoreDoc.
         """
         """
         Returns a snippet for found scoreDoc.
         """
-        htmlFormatter = SimpleHTMLFormatter()
-        highlighter = Highlighter(htmlFormatter, QueryScorer(query))
-
-        stored = self.searcher.doc(scoreDoc.doc)
-
-        position = stored.get('snippets_position')
-        length = stored.get('snippets_length')
-        if position is None or length is None:
-            return None
-        # locate content.
-        snippets = Snippets(stored.get('book_id')).open()
+        maxnum = len(searchresult)
+        if num is None or num < 0 or num > maxnum:
+            num = maxnum
+        book_id = searchresult.book_id
+        revision = searchresult.snippet_revision()
+        snippets = Snippets(book_id, revision=revision)
+        snips = [None] * maxnum
         try:
         try:
-            text = snippets.get((int(position),
-                                 int(length)))
+            snippets.open()
+            idx = 0
+            while idx < maxnum and num > 0:
+                position, length = searchresult.snippet_pos(idx)
+                if position is None or length is None:
+                    continue
+                text = snippets.get((int(position),
+                                     int(length)))
+                snip = self.index.highlight(text=text, field=field, q=query)
+                snips[idx] = snip
+                if snip:
+                    num -= 1
+                idx += 1
+
+        except IOError, e:
+            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
+            return []
         finally:
             snippets.close()
 
         finally:
             snippets.close()
 
-        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
-        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
-        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
+            # remove verse end markers..
+        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
 
 
-        return snip
+        searchresult.snippets = snips
 
 
-    @staticmethod
-    def enum_to_array(enum):
+        return snips
+
+    def hint_tags(self, query, pdcounter=True, prefix=True):
         """
         """
-        Converts a lucene TermEnum to array of Terms, suitable for
-        addition to queries
+        Return auto-complete hints for tags
+        using prefix search.
         """
         """
-        terms = []
-
-        while True:
-            t = enum.term()
-            if t:
-                terms.append(t)
-            if not enum.next(): break
+        q = self.index.Q()
+        query = query.strip()
+        for field in ['tag_name', 'tag_name_pl']:
+            if prefix:
+                q |= self.index.Q(**{field: query + "*"})
+            else:
+                q |= self.make_term_query(query, field=field)
+        qu = self.index.query(q).exclude(tag_category="book")
 
 
-        if terms:
-            return JArray('object')(terms, Term)
+        return self.search_tags(qu, pdcounter=pdcounter)
 
 
-    def search_tags(self, query, filter=None, max_results=40):
+    def search_tags(self, query, filters=None, pdcounter=False):
         """
         Search for Tag objects using query.
         """
         """
         Search for Tag objects using query.
         """
-        tops = self.searcher.search(query, filter, max_results)
+        if not filters: filters = []
+        if not pdcounter:
+            filters.append(~self.index.Q(is_pdcounter=True))
+        res = self.apply_filters(query, filters).execute()
 
         tags = []
 
         tags = []
-        for found in tops.scoreDocs:
-            doc = self.searcher.doc(found.doc)
-            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
-            tags.append(tag)
-            print "%s (%d) -> %f" % (tag, tag.id, found.score)
-
-        return tags
-
-    def search_books(self, query, filter=None, max_results=10):
-        """
-        Searches for Book objects using query
-        """
-        bks = []
-        tops = self.searcher.search(query, filter, max_results)
-        for found in tops.scoreDocs:
-            doc = self.searcher.doc(found.doc)
-            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
-        return bks
+        pd_tags = []
 
 
-    def create_prefix_phrase(self, toks, field):
-        q = MultiPhraseQuery()
-        for i in range(len(toks)):
-            t = Term(field, toks[i])
-            if i == len(toks) - 1:
-                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
-                if pterms:
-                    q.add(pterms)
+        for doc in res:
+            is_pdcounter = doc.get('is_pdcounter', False)
+            category = doc.get('tag_category')
+            try:
+                if is_pdcounter == True:
+                    if category == 'pd_author':
+                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
+                    elif category == 'pd_book':
+                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
+                        tag.category = 'pd_book'  # make it look more lik a tag.
+                    else:
+                        print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)).encode('utf-8')
+                    pd_tags.append(tag)
                 else:
                 else:
-                    q.add(t)
-            else:
-                q.add(t)
-        return q
+                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
+                    tags.append(tag)
 
 
-    @staticmethod
-    def term_filter(term, inverse=False):
-        only_term = TermsFilter()
-        only_term.addTerm(term)
-
-        if inverse:
-            neg = BooleanFilter()
-            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
-            only_term = neg
-
-        return only_term
-
-    def hint_tags(self, string, max_results=50):
-        """
-        Return auto-complete hints for tags
-        using prefix search.
-        """
-        toks = self.get_tokens(string, field='SIMPLE')
-        top = BooleanQuery()
+            except catalogue.models.Tag.DoesNotExist: pass
+            except PDCounterAuthor.DoesNotExist: pass
+            except PDCounterBook.DoesNotExist: pass
 
 
-        for field in ['tag_name', 'tag_name_pl']:
-            q = self.create_prefix_phrase(toks, field)
-            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
+        tags_slugs = set(map(lambda t: t.slug, tags))
+        tags = tags + filter(lambda t: not t.slug in tags_slugs, pd_tags)
 
 
-        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
+        log.debug('search_tags: %s' % tags)
 
 
-        return self.search_tags(top, no_book_cat, max_results=max_results)
+        return tags
 
 
-    def hint_books(self, string, max_results=50):
+    def hint_books(self, query, prefix=True):
         """
         Returns auto-complete hints for book titles
         Because we do not index 'pseudo' title-tags.
         Prefix search.
         """
         """
         Returns auto-complete hints for book titles
         Because we do not index 'pseudo' title-tags.
         Prefix search.
         """
-        toks = self.get_tokens(string, field='SIMPLE')
-
-        q = self.create_prefix_phrase(toks, 'title')
-
-        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
+        q = self.index.Q()
+        query = query.strip()
+        if prefix:
+            q |= self.index.Q(title=query + "*")
+        else:
+            q |= self.make_term_query(query, field='title')
+        qu = self.index.query(q)
+        only_books = self.index.Q(is_book=True)
+        return self.search_books(qu, [only_books])
 
 
-    @staticmethod
-    def chain_filters(filters, op=ChainedFilter.AND):
+    def search_books(self, query, filters=None, max_results=10):
         """
         """
-        Chains a filter list together
+        Searches for Book objects using query
         """
         """
-        filters = filter(lambda x: x is not None, filters)
-        if not filters or filters is []:
-            return None
-        chf = ChainedFilter(JArray('object')(filters, Filter), op)
-        return chf
+        bks = []
+        bks_found = set()
+        query = query.query(is_book=True)
+        res = self.apply_filters(query, filters).field_limit(['book_id'])
+        for r in res:
+            try:
+                bid = r['book_id']
+                if not bid in bks_found:
+                    bks.append(catalogue.models.Book.objects.get(id=bid))
+                    bks_found.add(bid)
+            except catalogue.models.Book.DoesNotExist: pass
+        return bks
 
 
-    def filtered_categories(self, tags):
+    @staticmethod
+    def apply_filters(query, filters):
         """
         """
-        Return a list of tag categories, present in tags list.
+        Apply filters to a query
         """
         """
-        cats = {}
-        for t in tags:
-            cats[t.category] = True
-        return cats.keys()
-
-    def hint(self):
-        return Hint(self)
+        if filters is None: filters = []
+        filters = filter(lambda x: x is not None, filters)
+        for f in filters:
+            query = query.query(f)
+        return query