store contents in index for highlighting fragments
[wolnelektury.git] / apps / search / index.py
index d539257..36f4247 100644 (file)
@@ -7,14 +7,17 @@ from lucene import SimpleFSDirectory, IndexWriter, File, Field, \
     KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
     BlockJoinQuery, BlockJoinCollector, TermsFilter, \
     HashSet, BooleanClause, Term, CharTermAttribute, \
     KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
     BlockJoinQuery, BlockJoinCollector, TermsFilter, \
     HashSet, BooleanClause, Term, CharTermAttribute, \
-    PhraseQuery, StringReader
+    PhraseQuery, StringReader, TermQuery, BlockJoinQuery, \
+    Sort, Integer
     # KeywordAnalyzer
     # KeywordAnalyzer
+import sys
 import os
 import errno
 from librarian import dcparser
 from librarian.parser import WLDocument
 import catalogue.models
 from multiprocessing.pool import ThreadPool
 import os
 import errno
 from librarian import dcparser
 from librarian.parser import WLDocument
 import catalogue.models
 from multiprocessing.pool import ThreadPool
+from threading import current_thread
 import atexit
 
 
 import atexit
 
 
@@ -82,6 +85,7 @@ class Index(IndexStore):
         if overwrite:
             self.remove_book(book)
 
         if overwrite:
             self.remove_book(book)
 
+
         doc = self.extract_metadata(book)
         parts = self.extract_content(book)
         block = ArrayList().of_(Document)
         doc = self.extract_metadata(book)
         parts = self.extract_content(book)
         block = ArrayList().of_(Document)
@@ -100,7 +104,7 @@ class Index(IndexStore):
         'wywiad'
         ]
 
         'wywiad'
         ]
 
-    skip_header_tags = ['autor_utworu', 'nazwa_utworu']
+    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
 
     def create_book_doc(self, book):
         """
 
     def create_book_doc(self, book):
         """
@@ -115,6 +119,8 @@ class Index(IndexStore):
     def extract_metadata(self, book):
         book_info = dcparser.parse(book.xml_file)
 
     def extract_metadata(self, book):
         book_info = dcparser.parse(book.xml_file)
 
+        print("extract metadata for book %s id=%d, thread%d" % (book.slug, book.id, current_thread().ident))
+        
         doc = self.create_book_doc(book)
         doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))
         doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))
         doc = self.create_book_doc(book)
         doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))
         doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))
@@ -173,7 +179,7 @@ class Index(IndexStore):
             doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
             doc.add(Field("header_type", header.tag, Field.Store.YES, Field.Index.NOT_ANALYZED))
             content = u' '.join([t for t in header.itertext()])
             doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
             doc.add(Field("header_type", header.tag, Field.Store.YES, Field.Index.NOT_ANALYZED))
             content = u' '.join([t for t in header.itertext()])
-            doc.add(Field("content", content, Field.Store.NO, Field.Index.ANALYZED))
+            doc.add(Field("content", content, Field.Store.YES, Field.Index.ANALYZED))
             header_docs.append(doc)
 
         def walker(node):
             header_docs.append(doc)
 
         def walker(node):
@@ -216,7 +222,7 @@ class Index(IndexStore):
                               Field.Store.YES, Field.Index.NOT_ANALYZED))
                 doc.add(Field("content",
                               u' '.join(filter(lambda s: s is not None, frag['content'])),
                               Field.Store.YES, Field.Index.NOT_ANALYZED))
                 doc.add(Field("content",
                               u' '.join(filter(lambda s: s is not None, frag['content'])),
-                              Field.Store.NO, Field.Index.ANALYZED))
+                              Field.Store.YES, Field.Index.ANALYZED))
                 doc.add(Field("themes",
                               u' '.join(filter(lambda s: s is not None, frag['themes'])),
                               Field.Store.NO, Field.Index.ANALYZED))
                 doc.add(Field("themes",
                               u' '.join(filter(lambda s: s is not None, frag['themes'])),
                               Field.Store.NO, Field.Index.ANALYZED))
@@ -255,6 +261,7 @@ class ReusableIndex(Index):
         if ReusableIndex.index is not None:
             self.index = ReusableIndex.index
         else:
         if ReusableIndex.index is not None:
             self.index = ReusableIndex.index
         else:
+            print("opening index")
             ReusableIndex.pool = ThreadPool(threads)
             ReusableIndex.pool_jobs = []
             Index.open(self, analyzer)
             ReusableIndex.pool = ThreadPool(threads)
             ReusableIndex.pool_jobs = []
             Index.open(self, analyzer)
@@ -262,12 +269,13 @@ class ReusableIndex(Index):
             atexit.register(ReusableIndex.close_reusable)
 
     def index_book(self, *args, **kw):
             atexit.register(ReusableIndex.close_reusable)
 
     def index_book(self, *args, **kw):
-        job = ReusableIndex.pool.apply_async(Index.index_book, args, kw)
+        job = ReusableIndex.pool.apply_async(Index.index_book, (self,)+ args, kw)
         ReusableIndex.pool_jobs.append(job)
 
     @staticmethod
     def close_reusable():
         if ReusableIndex.index is not None:
         ReusableIndex.pool_jobs.append(job)
 
     @staticmethod
     def close_reusable():
         if ReusableIndex.index is not None:
+            print("closing index")
             for job in ReusableIndex.pool_jobs:
                 job.wait()
             ReusableIndex.pool.close()
             for job in ReusableIndex.pool_jobs:
                 job.wait()
             ReusableIndex.pool.close()
@@ -365,39 +373,89 @@ class Search(IndexStore):
 class MultiSearch(Search):
     """Class capable of IMDb-like searching"""
     def get_tokens(self, queryreader):
 class MultiSearch(Search):
     """Class capable of IMDb-like searching"""
     def get_tokens(self, queryreader):
-        if isinstance(queryreader, str):
+        if isinstance(queryreader, str) or isinstance(queryreader, unicode):
             queryreader = StringReader(queryreader)
         queryreader.reset()
         tokens = self.analyzer.reusableTokenStream('content', queryreader)
         toks = []
         while tokens.incrementToken():
             cta = tokens.getAttribute(CharTermAttribute.class_)
             queryreader = StringReader(queryreader)
         queryreader.reset()
         tokens = self.analyzer.reusableTokenStream('content', queryreader)
         toks = []
         while tokens.incrementToken():
             cta = tokens.getAttribute(CharTermAttribute.class_)
-            toks.append(cta)
+            toks.append(cta.toString())
         return toks
 
         return toks
 
-    def make_phrase(self, tokens, field='content', joined=False):
+    def make_phrase(self, tokens, field='content', slop=2):
         phrase = PhraseQuery()
         phrase = PhraseQuery()
+        phrase.setSlop(slop)
         for t in tokens:
             term = Term(field, t)
             phrase.add(term)
         for t in tokens:
             term = Term(field, t)
             phrase.add(term)
-        if joined:
-            phrase = self.content_query(phrase)
         return phrase
 
         return phrase
 
-    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, joined=False):
+    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD):
         q = BooleanQuery()
         for t in tokens:
             term = Term(field, t)
         q = BooleanQuery()
         for t in tokens:
             term = Term(field, t)
-            q.add(BooleanClause(term, modal))
-        if joined:
-            self.content_query(q)
+            q.add(BooleanClause(TermQuery(term), modal))
         return q
 
     def content_query(self, query):
         return BlockJoinQuery(query, self.parent_filter,
                               BlockJoinQuery.ScoreMode.Total)
 
         return q
 
     def content_query(self, query):
         return BlockJoinQuery(query, self.parent_filter,
                               BlockJoinQuery.ScoreMode.Total)
 
-    def multiseach(self, query, max_results=50):
+    def search_perfect(self, tokens, max_results=20):
+        qrys = [self.make_phrase(tokens, field=fld) for fld in ['author', 'title', 'content']]
+
+        books = []
+        for q in qrys:
+            top = self.searcher.search(q, max_results)
+            for found in top.scoreDocs:
+                book_info = self.searcher.doc(found.doc)
+                books.append((found.score, catalogue.models.Book.objects.get(id=book_info.get("book_id")), []))
+        return books
+
+    def search_everywhere(self, tokens, max_results=20):
+        q = BooleanQuery()
+        in_meta = BooleanQuery()
+        in_content = BooleanQuery()
+
+        for fld in ['themes', 'content']:
+            in_content.add(BooleanClause(self.make_term_query(tokens, field=fld), BooleanClause.Occur.SHOULD))
+
+        for fld in ['author', 'title', 'epochs', 'genres', 'kinds']:
+            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld), BooleanClause.Occur.SHOULD))
+
+        q.add(BooleanClause(in_meta, BooleanClause.Occur.MUST))
+        in_content_join = self.content_query(in_content)
+        q.add(BooleanClause(in_content_join, BooleanClause.Occur.MUST))
+
+        collector = BlockJoinCollector(Sort.RELEVANCE, 100, True, True)
+
+        self.searcher.search(q, collector)
+
+        books = []
+
+        top_groups = collector.getTopGroups(in_content_join, Sort.RELEVANCE, 0, max_results, 0, True)
+        if top_groups:
+            for grp in top_groups.groups:
+                doc_id = Integer.cast_(grp.groupValue).intValue()
+                book_data = self.searcher.doc(doc_id)
+                book = catalogue.models.Book.objects.get(id=book_data.get("book_id"))
+                parts = []
+                for part in grp.scoreDocs:
+                    part_data = self.searcher.doc(part.doc)
+                    header_type = part_data.get("header_type")
+                    if header_type:
+                        parts.append((part.score, {"header": header_type, "position": int(part_data.get("header_index"))}))
+                    fragment = part_data.get("fragment_anchor")
+                    if fragment:
+                        fragment = book.fragments.get(anchor=fragment)
+                        parts.append((part.score, {"fragment": fragment}))
+                books.append((grp.maxScore, book, parts))
+                
+        return books
+
+
+    def multisearch(self, query, max_results=50):
         """
         Search strategy:
         - (phrase) OR -> content
         """
         Search strategy:
         - (phrase) OR -> content
@@ -408,35 +466,46 @@ class MultiSearch(Search):
                       -> tags
                       -> content
         """
                       -> tags
                       -> content
         """
-        queryreader = StringReader(query)
-        tokens = self.get_tokens(queryreader)
+        queryreader = StringReader(query)
+        tokens = self.get_tokens(queryreader)
 
 
-        top_level = BooleanQuery()
-        Should = BooleanClause.Occur.SHOULD
+        top_level = BooleanQuery()
+        Should = BooleanClause.Occur.SHOULD
 
 
-        phrase_level = BooleanQuery()
+        # phrase_level = BooleanQuery()
+        # phrase_level.setBoost(1.3)
 
 
-        p_content = self.make_phrase(tokens, joined=True)
-        p_title = self.make_phrase(tokens, 'title')
-        p_author = self.make_phrase(tokens, 'author')
+        p_content = self.make_phrase(tokens, joined=True)
+        p_title = self.make_phrase(tokens, 'title')
+        p_author = self.make_phrase(tokens, 'author')
 
 
-        phrase_level.add(BooleanClause(p_content, Should))
-        phrase_level.add(BooleanClause(p_title, Should))
-        phrase_level.add(BooleanClause(p_author, Should))
+        phrase_level.add(BooleanClause(p_content, Should))
+        phrase_level.add(BooleanClause(p_title, Should))
+        phrase_level.add(BooleanClause(p_author, Should))
 
 
-        kw_level = BooleanQuery()
+        kw_level = BooleanQuery()
 
 
-        kw_level.add(self.make_term_query(tokens, 'author'), Should)
-        kw_level.add(self.make_term_query(tokens, 'themes', joined=True), Should)
-        kw_level.add(self.make_term_query(tokens, 'tags'), Should)
-        kw_level.add(self.make_term_query(tokens, joined=True), Should)
+        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
+        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
+        # kw_level.add(j_themes, Should)
+        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
+        # j_con = self.make_term_query(tokens, joined=True)
+        # kw_level.add(j_con, Should)
 
 
-        top_level.add(BooleanClause(phrase_level, Should))
-        top_level.add(BooleanClause(kw_level, Should))
+        # top_level.add(BooleanClause(phrase_level, Should))
+        # top_level.add(BooleanClause(kw_level, Should))
+
+        return None
+
+    
+    def do_search(self, query, max_results=50, collector=None):
+        tops = self.searcher.search(query, max_results)
+        #tops = self.searcher.search(p_content, max_results)
 
 
-        tops = self.searcher.search(top_level, max_results)
         bks = []
         for found in tops.scoreDocs:
             doc = self.searcher.doc(found.doc)
         bks = []
         for found in tops.scoreDocs:
             doc = self.searcher.doc(found.doc)
-            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
+            b = catalogue.models.Book.objects.get(id=doc.get("book_id"))
+            bks.append(b)
+            print "%s (%d) -> %f" % (b, b.id, found.score)
         return (bks, tops.totalHits)
         return (bks, tops.totalHits)