problems with snippets---outofmemoryexception
[wolnelektury.git] / apps / search / index.py
index f5489a5..a19dd69 100644
@@ -1,22 +1,34 @@
 # -*- coding: utf-8 -*-
+
 from django.conf import settings
-from lucene import SimpleFSDirectory, IndexWriter, File, Field, \
+from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
+    File, Field, \
     NumericField, Version, Document, JavaError, IndexSearcher, \
-    QueryParser, Term, PerFieldAnalyzerWrapper, \
+    QueryParser, PerFieldAnalyzerWrapper, \
     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
     KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
     BlockJoinQuery, BlockJoinCollector, TermsFilter, \
     HashSet, BooleanClause, Term, CharTermAttribute, \
-    PhraseQuery, StringReader
+    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
+    FuzzyQuery, FuzzyTermEnum, Sort, Integer, \
+    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
+    initVM, CLASSPATH, JArray
     # KeywordAnalyzer
+
+# Initialize the JVM for PyLucene; maxheap is passed to -Xmx (here 400 MB, given in bytes)
+JVM = initVM(classpath=CLASSPATH, maxheap=str(400*1024*1024))
+
 import sys
 import os
+import re
 import errno
 from librarian import dcparser
 from librarian.parser import WLDocument
 import catalogue.models
 from multiprocessing.pool import ThreadPool
+from threading import current_thread
 import atexit
+import traceback
 
 
 class WLAnalyzer(PerFieldAnalyzerWrapper):
@@ -38,7 +50,9 @@ class WLAnalyzer(PerFieldAnalyzerWrapper):
         self.addAnalyzer("author", simple)
         self.addAnalyzer("is_book", keyword)
 
         self.addAnalyzer("author", simple)
         self.addAnalyzer("is_book", keyword)
 
-        #self.addanalyzer("fragment_anchor", keyword)
+        self.addAnalyzer("KEYWORD", keyword)
+        self.addAnalyzer("SIMPLE", simple)
+        self.addAnalyzer("POLISH", polish)
 
 
 class IndexStore(object):
@@ -55,6 +69,51 @@ class IndexStore(object):
             else: raise
 
 
+class IndexChecker(IndexStore):
+    def __init__(self):
+        IndexStore.__init__(self)
+
+    def check(self):
+        checker = CheckIndex(self.store)
+        status = checker.checkIndex()
+        return status
+
+
+class Snippets(object):
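+    """Stores the raw text of indexed sections in one flat file per book,
+    so that search results can display snippets without re-parsing the source XML."""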
+    SNIPPET_DIR = "snippets"
+
+    def __init__(self, book_id):
+        try:
+            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
+        except OSError as exc:
+            if exc.errno == errno.EEXIST:
+                pass
+            else: raise
+        self.book_id = book_id
+        self.file = None
+
+    def open(self, mode='r'):
+        if 'b' not in mode:
+            mode += 'b'
+        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
+        self.position = 0
+        return self
+
+    def add(self, snippet):
+        """Appends the snippet text; returns its (position, length) in bytes within the file."""
+        txt = snippet.encode('utf-8')
+        l = len(txt)
+        self.file.write(txt)
+        pos = (self.position, l)
+        self.position += l
+        return pos
+
+    def get(self, pos):
+        """Reads back the snippet stored at pos = (position, length)."""
+        self.file.seek(pos[0], 0)
+        return self.file.read(pos[1]).decode('utf-8')
+
+    def close(self):
+        self.file.close()
+
+
 class Index(IndexStore):
     def __init__(self, analyzer=None):
         IndexStore.__init__(self)
@@ -70,13 +129,20 @@ class Index(IndexStore):
                                  IndexWriter.MaxFieldLength.LIMITED)
         return self.index
 
-    def close(self):
+    def optimize(self):
         self.index.optimize()
+
+    def close(self):
+        try:
+            self.index.optimize()
+        except JavaError, je:
+            print "Error during optimize phase, check index: %s" % je
+
         self.index.close()
         self.index = None
 
     def remove_book(self, book):
-        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True,True)
+        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
         self.index.deleteDocuments(q)
 
     def index_book(self, book, overwrite=True):
@@ -86,11 +152,13 @@ class Index(IndexStore):
         doc = self.extract_metadata(book)
         parts = self.extract_content(book)
         block = ArrayList().of_(Document)
-
+
+        print "adding block."
         for p in parts:
             block.add(p)
         block.add(doc)
         self.index.addDocuments(block)
+        print "added."
 
     master_tags = [
         'opowiadanie',
@@ -101,7 +169,7 @@ class Index(IndexStore):
         'wywiad'
         ]
 
-    skip_header_tags = ['autor_utworu', 'nazwa_utworu']
+    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
 
     def create_book_doc(self, book):
         """
@@ -116,6 +184,8 @@ class Index(IndexStore):
     def extract_metadata(self, book):
         book_info = dcparser.parse(book.xml_file)
 
+        print("extract metadata for book %s id=%d, thread%d" % (book.slug, book.id, current_thread().ident))
+
         doc = self.create_book_doc(book)
         doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))
         doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))
@@ -153,29 +223,13 @@ class Index(IndexStore):
             if master.tag in self.master_tags:
                 return master
 
-    
     def extract_content(self, book):
         wld = WLDocument.from_file(book.xml_file.path)
         root = wld.edoc.getroot()
 
-        # first we build a sequence of top-level items.
-        # book_id
-        # header_index - the 0-indexed position of header element.
-        # content
         master = self.get_master(root)
         if master is None:
             return []
-        
-        header_docs = []
-        for header, position in zip(list(master), range(len(master))):
-            if header.tag in self.skip_header_tags:
-                continue
-            doc = self.create_book_doc(book)
-            doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
-            doc.add(Field("header_type", header.tag, Field.Store.YES, Field.Index.NOT_ANALYZED))
-            content = u' '.join([t for t in header.itertext()])
-            doc.add(Field("content", content, Field.Store.NO, Field.Index.ANALYZED))
-            header_docs.append(doc)
 
         def walker(node):
             yield node, None
@@ -185,50 +239,91 @@ class Index(IndexStore):
             yield None, node
             return
 
+        def fix_format(text):
+            return re.sub("/$", "", text, flags=re.M)
+
+        # header_type
+        # header_index
+        header_docs = []
         # Then we create a document for each fragment
         # fragment_anchor - the anchor
         # themes - list of themes [not indexed]
         fragment_docs = []
         # will contain (fragment id -> { content: [], themes: [] })
         fragments = {}
-        for start, end in walker(master):
-            if start is not None and start.tag == 'begin':
-                fid = start.attrib['id'][1:]
-                fragments[fid] = {'content': [], 'themes': []}
-                fragments[fid]['content'].append(start.tail)
-            elif start is not None and start.tag == 'motyw':
-                fid = start.attrib['id'][1:]
-                fragments[fid]['themes'].append(start.text)
-                fragments[fid]['content'].append(start.tail)
-            elif start is not None and start.tag == 'end':
-                fid = start.attrib['id'][1:]
-                if fid not in fragments:
-                    continue  # a broken <end> node, skip it
-                frag = fragments[fid]
-                del fragments[fid]
-
-                def jstr(l):
-                    return u' '.join(map(
-                        lambda x: x == None and u'(none)' or unicode(x),
-                        l))
+        snippets = Snippets(book.id).open('w')
+        try:
+            for header, position in zip(list(master), range(len(master))):
+                sys.stdout.write("\rsection: %d" % position)
+
+                if header.tag in self.skip_header_tags:
+                    continue
 
                 doc = self.create_book_doc(book)
-                doc.add(Field("fragment_anchor", fid,
-                              Field.Store.YES, Field.Index.NOT_ANALYZED))
-                doc.add(Field("content",
-                              u' '.join(filter(lambda s: s is not None, frag['content'])),
-                              Field.Store.NO, Field.Index.ANALYZED))
-                doc.add(Field("themes",
-                              u' '.join(filter(lambda s: s is not None, frag['themes'])),
-                              Field.Store.NO, Field.Index.ANALYZED))
-
-                fragment_docs.append(doc)
-            elif start is not None:
-                for frag in fragments.values():
-                    frag['content'].append(start.text)
-            elif end is not None:
-                for frag in fragments.values():
-                    frag['content'].append(end.tail)
+
+                doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
+                doc.add(Field("header_type", header.tag, Field.Store.YES, Field.Index.NOT_ANALYZED))
+
+                content = u' '.join([t for t in header.itertext()])
+                content = fix_format(content)
+
+                doc.add(Field("content", content, Field.Store.NO, Field.Index.ANALYZED))
+                snip_pos = snippets.add(content)
+                doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
+                doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[0]))
+
+                header_docs.append(doc)
+
+                for start, end in walker(master):
+                    if start is not None and start.tag == 'begin':
+                        fid = start.attrib['id'][1:]
+                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
+                        fragments[fid]['content'].append(start.tail)
+                    elif start is not None and start.tag == 'motyw':
+                        fid = start.attrib['id'][1:]
+                        fragments[fid]['themes'].append(start.text)
+                        fragments[fid]['content'].append(start.tail)
+                    elif start is not None and start.tag == 'end':
+                        fid = start.attrib['id'][1:]
+                        if fid not in fragments:
+                            continue  # a broken <end> node, skip it
+                        frag = fragments[fid]
+                        del fragments[fid]
+
+                        def jstr(l):
+                            return u' '.join(map(
+                                lambda x: x == None and u'(none)' or unicode(x),
+                                l))
+
+                        doc = self.create_book_doc(book)
+
+                        doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
+                        doc.add(NumericField("header_span", Field.Store.YES, True).setIntValue(position - frag['start_section'] + 1))
+                        doc.add(Field("header_type", frag['start_header'], Field.Store.YES, Field.Index.NOT_ANALYZED))
+
+                        doc.add(Field("fragment_anchor", fid,
+                                      Field.Store.YES, Field.Index.NOT_ANALYZED))
+                        doc.add(Field("content",
+                                      u' '.join(filter(lambda s: s is not None, frag['content'])),
+                                      Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
+
+                        snip_pos = snippets.add(content)
+                        doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
+                        doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[0]))
+
+                        doc.add(Field("themes",
+                                      u' '.join(filter(lambda s: s is not None, frag['themes'])),
+                                      Field.Store.NO, Field.Index.ANALYZED))
+
+                        fragment_docs.append(doc)
+                    elif start is not None:
+                        for frag in fragments.values():
+                            frag['content'].append(start.text)
+                    elif end is not None:
+                        for frag in fragments.values():
+                            frag['content'].append(end.tail)
+        finally:
+            snippets.close()
 
         return header_docs + fragment_docs
 
@@ -240,6 +335,17 @@ class Index(IndexStore):
         self.close()
 
 
+def log_exception_wrapper(f):
+    def _wrap(*a):
+        try:
+            f(*a)
+        except Exception, e:
+            print("Error in indexing thread: %s" % e)
+            traceback.print_exc()
+            raise e
+    return _wrap
+
+
 class ReusableIndex(Index):
     """
     Works like index, but does not close/optimize Lucene index
@@ -256,26 +362,26 @@ class ReusableIndex(Index):
         if ReusableIndex.index is not None:
             self.index = ReusableIndex.index
         else:
-            ReusableIndex.pool = ThreadPool(threads)
+            print("opening index")
+            ReusableIndex.pool = ThreadPool(threads, initializer=lambda: JVM.attachCurrentThread())
             ReusableIndex.pool_jobs = []
             Index.open(self, analyzer)
             ReusableIndex.index = self.index
             atexit.register(ReusableIndex.close_reusable)
 
     def index_book(self, *args, **kw):
-        job = ReusableIndex.pool.apply_async(Index.index_book, args, kw)
+        job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
         ReusableIndex.pool_jobs.append(job)
 
     @staticmethod
     def close_reusable():
         if ReusableIndex.index is not None:
-            all_jobs = len(ReusableIndex.pool_jobs)
-            waited=1
+            print("wait for indexing to finish")
             for job in ReusableIndex.pool_jobs:
-                sys.stdout.write("\rWaiting for search index job: %d/%d..." % (waited, all_jobs))
-                job.wait()
-                waited+=1
-            print("Indexing done.")
+                job.get()
+                sys.stdout.write('.')
+                sys.stdout.flush()
+            print("done.")
             ReusableIndex.pool.close()
 
             ReusableIndex.index.optimize()
@@ -289,7 +395,7 @@ class ReusableIndex(Index):
 class Search(IndexStore):
     def __init__(self, default_field="content"):
         IndexStore.__init__(self)
-        self.analyzer = PolishAnalyzer(Version.LUCENE_34)
+        self.analyzer = WLAnalyzer() #PolishAnalyzer(Version.LUCENE_34)
         ## self.analyzer = WLAnalyzer()
         self.searcher = IndexSearcher(self.store, True)
         self.parser = QueryParser(Version.LUCENE_34, default_field,
@@ -368,42 +474,224 @@ class Search(IndexStore):
 # }
 
 
+class SearchResult(object):
+    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets_cb=None):
+        if score:
+            self.score = score
+        else:
+            self.score = scoreDocs.score
+
+        self.hits = []
+        self.snippets = []
+
+        stored = searcher.doc(scoreDocs.doc)
+        self.book_id = int(stored.get("book_id"))
+
+        header_type = stored.get("header_type")
+        sec = (header_type, int(stored.get("header_index")))
+        header_span = stored.get('header_span')
+        header_span = header_span is not None and int(header_span) or 1
+
+        fragment = stored.get("fragment_anchor")
+
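+        # hit: ((header_type, header_index, header_span), fragment_anchor or None, score, {'how_found': ..., 'snippets_cb': ...})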
+        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets_cb': snippets_cb})
+
+        self.hits.append(hit)
+
+    def merge(self, other):
+        if self.book_id != other.book_id:
+            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
+        self.hits += other.hits
+        if other.score > self.score:
+            self.score = other.score
+        return self
+
+    def add_snippets(self, snippets):
+        self.snippets += snippets
+        return self
+
+    def get_book(self):
+        return catalogue.models.Book.objects.get(id=self.book_id)
+
+    book = property(get_book)
+
+    def get_parts(self):
+        book = self.book
+
+        def sections_covered(results):
+            # hits with a fragment anchor vs. plain section hits
+            frags = filter(lambda r: r[1] is not None, results)
+            sect = filter(lambda r: r[1] is None, results)
+            # drop section hits already covered by some fragment's header span
+            sect = filter(lambda s: 0 == len(filter(
+                lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2],
+                frags)), sect)
+            print "filtered, non overlapped sections: %s" % sect
+            return frags + sect
+
+        parts = []
+        for hit in sections_covered(self.hits):
+            if hit[1] is not None:
+                parts.append({"fragment": book.fragments.get(anchor=hit[1]), '_score': hit[2]})
+            else:
+                parts.append({"header": hit[0][0], "position": hit[0][1], '_score': hit[2]})
+
+        parts.sort(lambda a, b: cmp(a['_score'], b['_score']))
+        print("bookid: %d parts: %s" % (self.book_id, parts))
+        return parts
+
+    parts = property(get_parts)
+
+
+    def __unicode__(self):
+        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
+
+    @staticmethod
+    def aggregate(*result_lists):
+        books = {}
+        for rl in result_lists:
+            for r in rl:
+                if r.book_id in books:
+                    books[r.book_id].merge(r)
+                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
+                else:
+                    books[r.book_id] = r
+        return books.values()
+
+    def __cmp__(self, other):
+        return cmp(self.score, other.score)
+
+
 class MultiSearch(Search):
     """Class capable of IMDb-like searching"""
-    def get_tokens(self, queryreader):
-        if isinstance(queryreader, str):
-            queryreader = StringReader(queryreader)
-        queryreader.reset()
-        tokens = self.analyzer.reusableTokenStream('content', queryreader)
+    def get_tokens(self, searched, field='content'):
+        """returns tokens analyzed by a proper (for a field) analyzer
+        argument can be: StringReader, string/unicode, or tokens. In the last case
+        they will just be returned (so we can reuse tokens, if we don't change the analyzer)
+        """
+        if isinstance(searched, str) or isinstance(searched, unicode):
+            searched = StringReader(searched)
+        elif isinstance(searched, list):
+            return searched
+
+        searched.reset()
+        tokens = self.analyzer.reusableTokenStream(field, searched)
         toks = []
         while tokens.incrementToken():
             cta = tokens.getAttribute(CharTermAttribute.class_)
-            toks.append(cta)
+            toks.append(cta.toString())
         return toks
 
-    def make_phrase(self, tokens, field='content', joined=False):
-        phrase = PhraseQuery()
-        for t in tokens:
-            term = Term(field, t)
-            phrase.add(term)
-        if joined:
-            phrase = self.content_query(phrase)
+    def fuzziness(self, fuzzy):
+        if not fuzzy:
+            return None
+        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
+            return fuzzy
+        else:
+            return 0.5
+
+    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
+        if fuzzy:
+            phrase = MultiPhraseQuery()
+            for t in tokens:
+                term = Term(field, t)
+                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
+                fuzzterms = []
+
+                while True:
+                    #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
+                    ft = fuzzterm.term()
+                    if ft:
+                        fuzzterms.append(ft)
+                    if not fuzzterm.next(): break
+                if fuzzterms:
+                    phrase.add(JArray('object')(fuzzterms, Term))
+                else:
+                    phrase.add(term)
+        else:
+            phrase = PhraseQuery()
+            phrase.setSlop(slop)
+            for t in tokens:
+                term = Term(field, t)
+                phrase.add(term)
         return phrase
 
         return phrase
 
-    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, joined=False):
+    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
         q = BooleanQuery()
         for t in tokens:
             term = Term(field, t)
         q = BooleanQuery()
         for t in tokens:
             term = Term(field, t)
+            if fuzzy:
+                term = FuzzyQuery(term, self.fuzziness(fuzzy))
+            else:
+                term = TermQuery(term)
             q.add(BooleanClause(term, modal))
             q.add(BooleanClause(term, modal))
-        if joined:
-            self.content_query(q)
         return q
 
     def content_query(self, query):
         return BlockJoinQuery(query, self.parent_filter,
                               BlockJoinQuery.ScoreMode.Total)
 
         return q
 
     def content_query(self, query):
         return BlockJoinQuery(query, self.parent_filter,
                               BlockJoinQuery.ScoreMode.Total)
 
-    def multiseach(self, query, max_results=50):
+    def search_perfect_book(self, searched, max_results=20, fuzzy=False):
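+        """Phrase-matches the query against the author and title fields (optionally fuzzily) and returns a SearchResult per hit."""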
+        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in ['author', 'title']]
+
+        books = []
+        for q in qrys:
+            top = self.searcher.search(q, max_results)
+            for found in top.scoreDocs:
+                books.append(SearchResult(self.searcher, found))
+        return books
+
+    def search_perfect_parts(self, searched, max_results=20, fuzzy=False):
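+        """Phrase-matches the query against section/fragment content and attaches highlighted snippets to each hit."""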
+        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
+
+        books = []
+        for q in qrys:
+            top = self.searcher.search(q, max_results)
+            for found in top.scoreDocs:
+                books.append(SearchResult(self.searcher, found).add_snippets(self.get_snippets(found, q)))
+
+        return books
+
+    def search_everywhere(self, searched, max_results=20, fuzzy=False):
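+        """Runs two queries: themes x content over the part documents, then book metadata
+        (author, title, epochs, genres, kinds) joined with themes/content matches inside the books."""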
+        books = []
+
+        # content only query : themes x content
+        q = BooleanQuery()
+
+        tokens = self.get_tokens(searched)
+        q.add(BooleanClause(self.make_term_query(tokens, field='themes', fuzzy=fuzzy), BooleanClause.Occur.MUST))
+        q.add(BooleanClause(self.make_term_query(tokens, field='content', fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
+
+        topDocs = self.searcher.search(q, max_results)
+        for found in topDocs.scoreDocs:
+            books.append(SearchResult(self.searcher, found))
+
+        # joined query themes/content x author/title/epochs/genres/kinds
+        q = BooleanQuery()
+        in_meta = BooleanQuery()
+        in_content = BooleanQuery()
+
+        for fld in ['themes', 'content']:
+            in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
+
+        in_meta.add(BooleanClause(self.make_term_query(
+            self.get_tokens(searched, field='author'), field='author', fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
+
+        for fld in ['title', 'epochs', 'genres', 'kinds']:
+            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
+
+        q.add(BooleanClause(in_meta, BooleanClause.Occur.MUST))
+        in_content_join = self.content_query(in_content)
+        q.add(BooleanClause(in_content_join, BooleanClause.Occur.MUST))
+        #        import pdb; pdb.set_trace()
+        collector = BlockJoinCollector(Sort.RELEVANCE, 100, True, True)
+
+        self.searcher.search(q, collector)
+
+        top_groups = collector.getTopGroups(in_content_join, Sort.RELEVANCE, 0, max_results, 0, True)
+        if top_groups:
+            for grp in top_groups.groups:
+                for part in grp.scoreDocs:
+                    books.append(SearchResult(self.searcher, part, score=grp.maxScore))
+        return books
+
+    def multisearch(self, query, max_results=50):
         """
         Search strategy:
         - (phrase) OR -> content
         """
         Search strategy:
         - (phrase) OR -> content
@@ -414,35 +702,65 @@ class MultiSearch(Search):
                       -> tags
                       -> content
         """
-        queryreader = StringReader(query)
-        tokens = self.get_tokens(queryreader)
+        # queryreader = StringReader(query)
+        # tokens = self.get_tokens(queryreader)
+
+        # top_level = BooleanQuery()
+        # Should = BooleanClause.Occur.SHOULD
 
-        top_level = BooleanQuery()
-        Should = BooleanClause.Occur.SHOULD
+        # phrase_level = BooleanQuery()
+        # phrase_level.setBoost(1.3)
 
-        phrase_level = BooleanQuery()
+        # p_content = self.make_phrase(tokens, joined=True)
+        # p_title = self.make_phrase(tokens, 'title')
+        # p_author = self.make_phrase(tokens, 'author')
 
-        p_content = self.make_phrase(tokens, joined=True)
-        p_title = self.make_phrase(tokens, 'title')
-        p_author = self.make_phrase(tokens, 'author')
+        # phrase_level.add(BooleanClause(p_content, Should))
+        # phrase_level.add(BooleanClause(p_title, Should))
+        # phrase_level.add(BooleanClause(p_author, Should))
 
-        phrase_level.add(BooleanClause(p_content, Should))
-        phrase_level.add(BooleanClause(p_title, Should))
-        phrase_level.add(BooleanClause(p_author, Should))
+        # kw_level = BooleanQuery()
 
-        kw_level = BooleanQuery()
+        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
+        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
+        # kw_level.add(j_themes, Should)
+        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
+        # j_con = self.make_term_query(tokens, joined=True)
+        # kw_level.add(j_con, Should)
 
-        kw_level.add(self.make_term_query(tokens, 'author'), Should)
-        kw_level.add(self.make_term_query(tokens, 'themes', joined=True), Should)
-        kw_level.add(self.make_term_query(tokens, 'tags'), Should)
-        kw_level.add(self.make_term_query(tokens, joined=True), Should)
+        # top_level.add(BooleanClause(phrase_level, Should))
+        # top_level.add(BooleanClause(kw_level, Should))
 
-        top_level.add(BooleanClause(phrase_level, Should))
-        top_level.add(BooleanClause(kw_level, Should))
+        return None
+
+    def do_search(self, query, max_results=50, collector=None):
+        tops = self.searcher.search(query, max_results)
+        #tops = self.searcher.search(p_content, max_results)
 
-        tops = self.searcher.search(top_level, max_results)
         bks = []
         for found in tops.scoreDocs:
             doc = self.searcher.doc(found.doc)
-            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
+            b = catalogue.models.Book.objects.get(id=doc.get("book_id"))
+            bks.append(b)
+            print "%s (%d) -> %f" % (b, b.id, found.score)
         return (bks, tops.totalHits)
+
+    def get_snippets(self, scoreDoc, query, field='content'):
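+        """Reads the stored text for a hit back from the per-book Snippets file and asks
+        the Lucene Highlighter for the best fragments matching the query."""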
+        htmlFormatter = SimpleHTMLFormatter()
+        highlighter = Highlighter(htmlFormatter, QueryScorer(query))
+
+        stored = self.searcher.doc(scoreDoc.doc)
+
+        # locate content.
+        snippets = Snippets(stored.get('book_id')).open()
+        try:
+            text = snippets.get((int(stored.get('snippets_position')), int(stored.get('snippets_length'))))
+        finally:
+            snippets.close()
+
+        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
+        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
+        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
+        print('snips: %s' % snip)
+
+        return [snip]
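
A minimal, standalone sketch (plain Python 2, no Lucene or Django; the class name, file path and sample strings below are only illustrative) of the snippet round trip this commit sets up: at index time each section's text is appended to a flat per-book file and the returned (position, length) pair is stored on the Lucene document, and at search time that pair is enough to seek back and re-read the text for highlighting.

class SnippetsSketch(object):
    """Stand-in for the Snippets store above: a flat file plus byte offsets."""
    def __init__(self, path):
        self.path = path
        self.file = None
        self.position = 0

    def open(self, mode='r'):
        if 'b' not in mode:
            mode += 'b'
        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        # positions and lengths are byte offsets, so measure the encoded text
        data = snippet.encode('utf-8')
        self.file.write(data)
        pos = (self.position, len(data))
        self.position += len(data)
        return pos

    def get(self, pos):
        self.file.seek(pos[0], 0)
        return self.file.read(pos[1]).decode('utf-8')

    def close(self):
        self.file.close()


# index time: remember where each section's text was written
store = SnippetsSketch('/tmp/snippets-demo').open('w')
first = store.add(u'Pierwszy rozdzial.')
second = store.add(u'Drugi rozdzial.')
store.close()

# search time: the stored (position, length) pair is enough to get the text back
store = SnippetsSketch('/tmp/snippets-demo').open('r')
print store.get(second)   # -> Drugi rozdzial.
store.close()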