merge
[wolnelektury.git] / apps / search / index.py
index 77ce877..33836ad 100644 (file)
@@ -25,6 +25,7 @@ import re
 import errno
 from librarian import dcparser
 from librarian.parser import WLDocument
 import errno
 from librarian import dcparser
 from librarian.parser import WLDocument
+from lxml import etree
 import catalogue.models
 from multiprocessing.pool import ThreadPool
 from threading import current_thread
 import catalogue.models
 from multiprocessing.pool import ThreadPool
 from threading import current_thread
@@ -54,6 +55,8 @@ class WLAnalyzer(PerFieldAnalyzerWrapper):
         self.addAnalyzer("source_name", simple)
         self.addAnalyzer("publisher", simple)
         self.addAnalyzer("authors", simple)
         self.addAnalyzer("source_name", simple)
         self.addAnalyzer("publisher", simple)
         self.addAnalyzer("authors", simple)
+        self.addAnalyzer("title", simple)
+
         self.addAnalyzer("is_book", keyword)
         # shouldn't the title have two forms? _pl and simple?
 
         self.addAnalyzer("is_book", keyword)
         # shouldn't the title have two forms? _pl and simple?
 
@@ -210,7 +213,7 @@ class Index(BaseIndex):
 
         for tag in catalogue.models.Tag.objects.all():
             doc = Document()
 
         for tag in catalogue.models.Tag.objects.all():
             doc = Document()
-            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
+            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
@@ -221,9 +224,9 @@ class Index(BaseIndex):
         Create a lucene document referring book id.
         """
         doc = Document()
         Create a lucene document referring book id.
         """
         doc = Document()
-        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
+        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
         if book.parent is not None:
         if book.parent is not None:
-            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
+            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
         return doc
 
     def remove_book(self, book):
         return doc
 
     def remove_book(self, book):
@@ -253,7 +256,7 @@ class Index(BaseIndex):
         self.index.addDocument(book_doc)
         del book_doc
 
         self.index.addDocument(book_doc)
         del book_doc
 
-        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])
+        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
 
     master_tags = [
         'opowiadanie',
 
     master_tags = [
         'opowiadanie',
@@ -261,11 +264,22 @@ class Index(BaseIndex):
         'dramat_wierszowany_l',
         'dramat_wierszowany_lp',
         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
         'dramat_wierszowany_l',
         'dramat_wierszowany_lp',
         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
-        'wywiad'
+        'wywiad',
+        ]
+
+    ignore_content_tags = [
+        'uwaga', 'extra',
+        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
+        'didaskalia',
+        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
         ]
 
         ]
 
+    footnote_tags = ['pa', 'pt', 'pr', 'pe']
+
     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
 
     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
 
+    # Matches a trailing run of digits optionally followed by "]", "." or
+    # spaces (e.g. "... 1925]."); group 1 holds the digits. Written as a raw
+    # string so that "\]" is not treated as a (invalid) string escape.
+    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
+
     def extract_metadata(self, book, book_info=None):
         """
         Extract metadata from book and returns a map of fields keyed by fieldname
     def extract_metadata(self, book, book_info=None):
         """
         Extract metadata from book and returns a map of fields keyed by fieldname
@@ -306,6 +320,13 @@ class Index(BaseIndex):
                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
 
                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
 
+        # get published date
+        source = book_info.source_name
+        match = self.published_date_re.search(source)
+        print("published date is %s %s" % (match, match is not None and match.groups()))
+        if match is not None:
+            fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
+
         return fields
 
     def add_gaps(self, fields, fieldname):
         return fields
 
     def add_gaps(self, fields, fieldname):
@@ -338,15 +359,26 @@ class Index(BaseIndex):
         if master is None:
             return []
 
         if master is None:
             return []
 
-        def walker(node):
+        def walker(node, ignore_tags=()):
+            """
+            Walk the subtree of `node` depth-first, yielding (node, None)
+            when an element is entered and (None, node) when it is left.
+            Children whose tag is in `ignore_tags` are skipped together
+            with their whole subtree, at every depth.
+            """
             yield node, None
-            for child in list(node):
-                for b, e in walker(child):
+            for child in list(node):
+                if child.tag in ignore_tags:
+                    continue
+                # Pass ignore_tags down; without it only the direct
+                # children of the starting node would be filtered.
+                for b, e in walker(child, ignore_tags):
                     yield b, e
             yield None, node
             return
 
         def fix_format(text):
                 for b, e in walker(child):
                     yield b, e
             yield None, node
             return
 
         def fix_format(text):
+            #            separator = [u" ", u"\t", u".", u";", u","]
+            if isinstance(text, list):
+                # need to join it first
+                text = filter(lambda s: s is not None, content)
+                text = u' '.join(text)
+                # for i in range(len(text)):
+                #     if i > 0:
+                #         if text[i][0] not in separator\
+                #             and text[i - 1][-1] not in separator:
+                #          text.insert(i, u" ")
+
             return re.sub("(?m)/$", "", text)
 
         def add_part(snippets, **fields):
             return re.sub("(?m)/$", "", text)
 
         def add_part(snippets, **fields):
@@ -394,58 +426,78 @@ class Index(BaseIndex):
 
         fragments = {}
         snippets = Snippets(book.id).open('w')
 
         fragments = {}
         snippets = Snippets(book.id).open('w')
+        position = 0
         try:
         try:
-            for header, position in zip(list(master), range(len(master))):
+            for header in list(master):
 
                 if header.tag in self.skip_header_tags:
                     continue
 
                 if header.tag in self.skip_header_tags:
                     continue
+                if header.tag is etree.Comment:
+                    continue
 
 
-                content = u' '.join([t for t in header.itertext()])
-                content = fix_format(content)
+                # section content
+                content = []
+                footnote = None
 
 
-                doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)
+                for start, end in walker(header, ignore_tags=self.ignore_content_tags):
+                    # handle footnotes
+                    if start is not None and start.tag in self.footnote_tags:
+                        footnote = ' '.join(start.itertext())
+                    elif end is not None and footnote is not None and end.tag in self.footnote_tags:
+                        doc = add_part(snippets, header_index=position, header_type=header.tag,
+                                       content=footnote)
 
 
-                self.index.addDocument(doc)
+                        self.index.addDocument(doc)
+
+                        footnote = None
 
 
-                for start, end in walker(header):
+                    # handle fragments and themes.
                     if start is not None and start.tag == 'begin':
                         fid = start.attrib['id'][1:]
                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                     if start is not None and start.tag == 'begin':
                         fid = start.attrib['id'][1:]
                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
-                        fragments[fid]['content'].append(start.tail)
+
                     elif start is not None and start.tag == 'motyw':
                         fid = start.attrib['id'][1:]
                         if start.text is not None:
                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                     elif start is not None and start.tag == 'motyw':
                         fid = start.attrib['id'][1:]
                         if start.text is not None:
                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
-                        fragments[fid]['content'].append(start.tail)
+
                     elif start is not None and start.tag == 'end':
                         fid = start.attrib['id'][1:]
                         if fid not in fragments:
                             continue  # a broken <end> node, skip it
                     elif start is not None and start.tag == 'end':
                         fid = start.attrib['id'][1:]
                         if fid not in fragments:
                             continue  # a broken <end> node, skip it
+                                      #                        import pdb; pdb.set_trace()
                         frag = fragments[fid]
                         if frag['themes'] == []:
                             continue  # empty themes list.
                         del fragments[fid]
 
                         frag = fragments[fid]
                         if frag['themes'] == []:
                             continue  # empty themes list.
                         del fragments[fid]
 
-                        def jstr(l):
-                            return u' '.join(map(
-                                lambda x: x == None and u'(none)' or unicode(x),
-                                l))
-
                         doc = add_part(snippets,
                                        header_type=frag['start_header'],
                                        header_index=frag['start_section'],
                                        header_span=position - frag['start_section'] + 1,
                                        fragment_anchor=fid,
                         doc = add_part(snippets,
                                        header_type=frag['start_header'],
                                        header_index=frag['start_section'],
                                        header_span=position - frag['start_section'] + 1,
                                        fragment_anchor=fid,
-                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
+                                       content=fix_format(frag['content']),
                                        themes=frag['themes'])
 
                         self.index.addDocument(doc)
                                        themes=frag['themes'])
 
                         self.index.addDocument(doc)
+
+                        # Collect content.
                     elif start is not None:
                         for frag in fragments.values():
                             frag['content'].append(start.text)
                     elif start is not None:
                         for frag in fragments.values():
                             frag['content'].append(start.text)
+                        content.append(start.text)
                     elif end is not None:
                         for frag in fragments.values():
                             frag['content'].append(end.tail)
                     elif end is not None:
                         for frag in fragments.values():
                             frag['content'].append(end.tail)
+                        content.append(end.tail)
+
+                        # in the end, add a section text.
+                doc = add_part(snippets, header_index=position, header_type=header.tag,
+                               content=fix_format(content))
+
+                self.index.addDocument(doc)
+                position += 1
+
         finally:
             snippets.close()
 
         finally:
             snippets.close()
 
@@ -539,17 +591,20 @@ class JoinSearch(object):
 
 
 class SearchResult(object):
 
 
 class SearchResult(object):
-    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
-        self.snippets = []
+    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
+        if tokens_cache is None: tokens_cache = {}
 
         if score:
 
         if score:
-            self.score = score
+            self._score = score
         else:
         else:
-            self.score = scoreDocs.score
+            self._score = scoreDocs.score
+
+        self.boost = 1.0
 
 
-        self.hits = []
+        self._hits = []
+        self._processed_hits = None  # processed hits
 
 
-        stored = searcher.doc(scoreDocs.doc)
+        stored = search.searcher.doc(scoreDocs.doc)
         self.book_id = int(stored.get("book_id"))
 
         header_type = stored.get("header_type")
         self.book_id = int(stored.get("book_id"))
 
         header_type = stored.get("header_type")
@@ -562,14 +617,30 @@ class SearchResult(object):
 
         fragment = stored.get("fragment_anchor")
 
 
         fragment = stored.get("fragment_anchor")
 
-        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': [snippets]})
+        pd = stored.get("published_date")
+        if pd is None:
+            print "published_date is none for book %d" % self.book_id
+            pd = 0
+        self.published_date = int(pd)
+
+        if snippets:
+            snippets = snippets.replace("/\n", "\n")
+        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
 
 
-        self.hits.append(hit)
+        self._hits.append(hit)
+
+        self.search = search
+        self.searched = searched
+        self.tokens_cache = tokens_cache
+
+    @property
+    def score(self):
+        """The stored score (`_score`) multiplied by this result's `boost`."""
+        return self._score * self.boost
 
     def merge(self, other):
         if self.book_id != other.book_id:
             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 
     def merge(self, other):
         if self.book_id != other.book_id:
             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
-        self.hits += other.hits
+        self._hits += other._hits
         if other.score > self.score:
             self.score = other.score
         return self
         if other.score > self.score:
             self.score = other.score
         return self
@@ -579,34 +650,89 @@ class SearchResult(object):
 
     book = property(get_book)
 
 
     book = property(get_book)
 
-    def process_hits(self):
-        frags = filter(lambda r: r[1] is not None, self.hits)
-        sect = filter(lambda r: r[1] is None, self.hits)
+    @property
+    def hits(self):
+        if self._processed_hits is not None:
+            return self._processed_hits
+
+        POSITION = 0
+        FRAGMENT = 1
+        POSITION_INDEX = 1
+        POSITION_SPAN = 2
+        SCORE = 2
+        OTHER = 3
+
+        # to sections and fragments
+        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
+        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
         sect = filter(lambda s: 0 == len(filter(
         sect = filter(lambda s: 0 == len(filter(
-            lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2],
+            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
+            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
             frags)), sect)
 
         hits = []
 
             frags)), sect)
 
         hits = []
 
+        # remove duplicate fragments
+        fragments = {}
+        for f in frags:
+            fid = f[FRAGMENT]
+            if fid in fragments:
+                if fragments[fid][SCORE] >= f[SCORE]:
+                    continue
+            fragments[fid] = f
+        frags = fragments.values()
+
+        # remove duplicate sections
+        sections = {}
+
         for s in sect:
         for s in sect:
-            m = {'score': s[2],
-                 'header_index': s[0][1]
+            si = s[POSITION][POSITION_INDEX]
+            # skip existing
+            if si in sections:
+                if sections[si]['score'] >= s[SCORE]:
+                    continue
+
+            m = {'score': s[SCORE],
+                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                  }
                  }
-            m.update(s[3])
-            hits.append(m)
+            m.update(s[OTHER])
+            sections[si] = m
+
+        hits = sections.values()
 
         for f in frags:
 
         for f in frags:
-            frag = catalogue.models.Fragment.objects.get(anchor=f[1])
-            m = {'score': f[2],
+            try:
+                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
+            except catalogue.models.Fragment.DoesNotExist:
+                # stale index
+                continue
+
+            # Figure out if we were searching for a token matching some word in theme name.
+            themes = frag.tags.filter(category='theme')
+            themes_hit = []
+            if self.searched is not None:
+                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
+                for theme in themes:
+                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
+                    print "THEME HIT: %s in %s" % (tokens, name_tokens)
+                    for t in tokens:
+                        if t in name_tokens:
+                            if not theme in themes_hit:
+                                themes_hit.append(theme)
+                            break
+
+            m = {'score': f[SCORE],
                  'fragment': frag,
                  'fragment': frag,
-                 'themes': frag.tags.filter(category='theme')
+                 'section_number': f[POSITION][POSITION_INDEX] + 1,
+                 'themes': themes,
+                 'themes_hit': themes_hit
                  }
                  }
-            m.update(f[3])
+            m.update(f[OTHER])
             hits.append(m)
 
         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 
             hits.append(m)
 
         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 
-        print("--- %s" % hits)
+        self._processed_hits = hits
 
         return hits
 
 
         return hits
 
@@ -626,7 +752,12 @@ class SearchResult(object):
         return books.values()
 
     def __cmp__(self, other):
         return books.values()
 
     def __cmp__(self, other):
-        return cmp(self.score, other.score)
+        """
+        Order results by score; when the scores are equal, the result
+        with the earlier published_date compares as the greater one.
+        """
+        c = cmp(self.score, other.score)
+        if c == 0:
+            # this is inverted, because earlier date is better
+            return cmp(other.published_date, self.published_date)
+        else:
+            return c
 
 
 class Hint(object):
 
 
 class Hint(object):
@@ -660,7 +791,7 @@ class Hint(object):
                 lst = self.book_tags.get(t.category, [])
                 lst.append(t)
                 self.book_tags[t.category] = lst
                 lst = self.book_tags.get(t.category, [])
                 lst.append(t)
                 self.book_tags[t.category] = lst
-            if t.category in ['theme']:
+            if t.category in ['theme', 'theme_pl']:
                 self.part_tags.append(t)
 
     def tag_filter(self, tags, field='tags'):
                 self.part_tags.append(t)
 
     def tag_filter(self, tags, field='tags'):
@@ -756,11 +887,14 @@ class Search(IndexStore):
             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
         return (bks, tops.totalHits)
 
             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
         return (bks, tops.totalHits)
 
-    def get_tokens(self, searched, field='content'):
+    def get_tokens(self, searched, field='content', cached=None):
         """returns tokens analyzed by a proper (for a field) analyzer
         argument can be: StringReader, string/unicode, or tokens. In the last case
         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
         """
         """returns tokens analyzed by a proper (for a field) analyzer
         argument can be: StringReader, string/unicode, or tokens. In the last case
         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
         """
+        if cached is not None and field in cached:
+            return cached[field]
+
         if isinstance(searched, str) or isinstance(searched, unicode):
             searched = StringReader(searched)
         elif isinstance(searched, list):
         if isinstance(searched, str) or isinstance(searched, unicode):
             searched = StringReader(searched)
         elif isinstance(searched, list):
@@ -772,6 +906,10 @@ class Search(IndexStore):
         while tokens.incrementToken():
             cta = tokens.getAttribute(CharTermAttribute.class_)
             toks.append(cta.toString())
         while tokens.incrementToken():
             cta = tokens.getAttribute(CharTermAttribute.class_)
             toks.append(cta.toString())
+
+        if cached is not None:
+            cached[field] = toks
+
         return toks
 
     def fuzziness(self, fuzzy):
         return toks
 
     def fuzziness(self, fuzzy):
@@ -828,9 +966,40 @@ class Search(IndexStore):
             q.add(BooleanClause(term, modal))
         return q
 
             q.add(BooleanClause(term, modal))
         return q
 
-    # def content_query(self, query):
-    #     return BlockJoinQuery(query, self.parent_filter,
-    #                           BlockJoinQuery.ScoreMode.Total)
+    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
+                      filters=None, tokens_cache=None, boost=None, snippets=False):
+        """
+        Search `field` for a phrase built from the tokens of `searched`.
+
+        With `book` set, an is_book == 'true' term filter is chained with
+        the given `filters`. `tokens_cache` is handed to get_tokens().
+        `snippets` decides whether a snippet is fetched for each hit.
+        `boost` is accepted but not used here.
+        Returns a list of SearchResult, one per scoreDoc found.
+        """
+        # Work on a copy so the caller's list is not extended below.
+        filters = list(filters) if filters is not None else []
+        if tokens_cache is None:
+            tokens_cache = {}
+
+        tokens = self.get_tokens(searched, field, cached=tokens_cache)
+
+        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy)
+        if book:
+            filters.append(self.term_filter(Term('is_book', 'true')))
+        top = self.searcher.search(query, self.chain_filters(filters), max_results)
+
+        results = []
+        for found in top.scoreDocs:
+            # a missing or empty snippet is passed on as None
+            snip = snippets and self.get_snippets(found, query) or None
+            results.append(SearchResult(self, found, snippets=snip, searched=searched))
+        return results
+
+    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
+                    filters=None, tokens_cache=None, boost=None):
+        """
+        Search for the tokens of `searched` in each of `fields`.
+
+        One term query per field is added as a SHOULD clause of a single
+        BooleanQuery. With `book` set, an is_book == 'true' term filter is
+        chained with the given `filters`. `boost` is accepted but not used
+        here. Returns a list of SearchResult (with snippets), one per
+        scoreDoc found.
+        """
+        # Work on a copy so the caller's list is not extended below.
+        filters = list(filters) if filters is not None else []
+        if tokens_cache is None:
+            tokens_cache = {}
+
+        if book:
+            filters.append(self.term_filter(Term('is_book', 'true')))
+
+        query = BooleanQuery()
+
+        for fld in fields:
+            tokens = self.get_tokens(searched, fld, cached=tokens_cache)
+
+            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
+                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
+
+        top = self.searcher.search(query, self.chain_filters(filters), max_results)
+
+        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
+                             snippets=self.get_snippets(found, query)) for found in top.scoreDocs]
 
     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
         """
 
     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
         """
@@ -853,12 +1022,39 @@ class Search(IndexStore):
                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                 max_results)
             for found in top.scoreDocs:
                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                 max_results)
             for found in top.scoreDocs:
-                books.append(SearchResult(self.searcher, found))
+                books.append(SearchResult(self, found, how_found="search_perfect_book"))
+        return books
+
+    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
+        """
+        Search for the tokens of `searched` in the tags, authors and title
+        fields, each added as a SHOULD clause, among documents with
+        is_book == 'true'.
+
+        A `hint` may veto the search (an empty list is returned), narrow
+        the list of fields and supply an additional filter.
+        Returns a list of SearchResult objects.
+        """
+        candidate_fields = ['tags', 'authors', 'title']
+        hint_filter = None
+        if hint:
+            if not hint.should_search_for_book():
+                return []
+            candidate_fields = hint.just_search_in(candidate_fields)
+            hint_filter = hint.book_filter()
+
+        tokens = self.get_tokens(searched, field='SIMPLE')
+
+        query = BooleanQuery()
+        for name in candidate_fields:
+            term_query = self.make_term_query(tokens, field=name, fuzzy=fuzzy)
+            query.add(BooleanClause(term_query, BooleanClause.Occur.SHOULD))
+
+        is_book = self.term_filter(Term('is_book', 'true'))
+        top = self.searcher.search(query, self.chain_filters([hint_filter, is_book]), max_results)
+
+        books = [SearchResult(self, found, how_found="search_book")
+                 for found in top.scoreDocs]
+
         return books
 
     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
         """
         return books
 
     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
         """
-        Search for book parts which containt a phrase perfectly matching (with a slop of 2, default for make_phrase())
+        Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
         some part/fragment of the book.
         """
         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
         some part/fragment of the book.
         """
         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
@@ -874,16 +1070,17 @@ class Search(IndexStore):
                                                            flt]),
                                        max_results)
             for found in top.scoreDocs:
                                                            flt]),
                                        max_results)
             for found in top.scoreDocs:
-                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q)))
+                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
 
         return books
 
 
         return books
 
-    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
+    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
         """
         Tries to use search terms to match different fields of book (or its parts).
         E.g. one word can be an author survey, another be a part of the title, and the rest
         are some words from third chapter.
         """
         """
         Tries to use search terms to match different fields of book (or its parts).
         E.g. one word can be an author survey, another be a part of the title, and the rest
         are some words from third chapter.
         """
+        if tokens_cache is None: tokens_cache = {}
         books = []
         only_in = None
 
         books = []
         only_in = None
 
@@ -893,29 +1090,40 @@ class Search(IndexStore):
         # content only query : themes x content
         q = BooleanQuery()
 
         # content only query : themes x content
         q = BooleanQuery()
 
-        tokens = self.get_tokens(searched)
-        if hint is None or hint.just_search_in(['themes_pl']) != []:
-            q.add(BooleanClause(self.make_term_query(tokens, field='themes_pl',
+        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
+        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
+
+        # only search in themes when we do not already filter by themes
+        if hint is None or hint.just_search_in(['themes']) != []:
+            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
 
                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
 
-        q.add(BooleanClause(self.make_term_query(tokens, field='content',
+        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 
         topDocs = self.searcher.search(q, only_in, max_results)
         for found in topDocs.scoreDocs:
                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 
         topDocs = self.searcher.search(q, only_in, max_results)
         for found in topDocs.scoreDocs:
-            books.append(SearchResult(self.searcher, found))
+            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
+            print "* %s theme x content: %s" % (searched, books[-1]._hits)
 
         # query themes/content x author/title/tags
         q = BooleanQuery()
 
         # query themes/content x author/title/tags
         q = BooleanQuery()
-        #        in_meta = BooleanQuery()
         in_content = BooleanQuery()
         in_content = BooleanQuery()
+        in_meta = BooleanQuery()
 
 
-        for fld in ['themes', 'content', 'tags', 'authors', 'title']:
-            in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
+        for fld in ['themes_pl', 'content']:
+            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
+
+        for fld in ['tags', 'authors', 'title']:
+            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
+
+        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
+        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
 
         topDocs = self.searcher.search(q, only_in, max_results)
         for found in topDocs.scoreDocs:
 
         topDocs = self.searcher.search(q, only_in, max_results)
         for found in topDocs.scoreDocs:
-            books.append(SearchResult(self.searcher, found))
+            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
+            print "* %s scatter search: %s" % (searched, books[-1]._hits)
 
         return books
 
 
         return books
 
@@ -961,7 +1169,6 @@ class Search(IndexStore):
 
         # return None
 
 
         # return None
 
-
     def get_snippets(self, scoreDoc, query, field='content'):
         """
         Returns a snippet for found scoreDoc.
     def get_snippets(self, scoreDoc, query, field='content'):
         """
         Returns a snippet for found scoreDoc.
@@ -971,17 +1178,20 @@ class Search(IndexStore):
 
         stored = self.searcher.doc(scoreDoc.doc)
 
 
         stored = self.searcher.doc(scoreDoc.doc)
 
+        position = stored.get('snippets_position')
+        length = stored.get('snippets_length')
+        if position is None or length is None:
+            return None
         # locate content.
         snippets = Snippets(stored.get('book_id')).open()
         try:
         # locate content.
         snippets = Snippets(stored.get('book_id')).open()
         try:
-            text = snippets.get((int(stored.get('snippets_position')),
-                                 int(stored.get('snippets_length'))))
+            text = snippets.get((int(position),
+                                 int(length)))
         finally:
             snippets.close()
 
         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
         finally:
             snippets.close()
 
         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
-        #        import pdb; pdb.set_trace()
         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
 
         return snip
         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
 
         return snip
@@ -1089,7 +1299,7 @@ class Search(IndexStore):
         Chains a filter list together
         """
         filters = filter(lambda x: x is not None, filters)
         Chains a filter list together
         """
         filters = filter(lambda x: x is not None, filters)
-        if not filters:
+        if not filters or filters is []:
             return None
         chf = ChainedFilter(JArray('object')(filters, Filter), op)
         return chf
             return None
         chf = ChainedFilter(JArray('object')(filters, Filter), op)
         return chf