split catalogue.models

[wolnelektury.git] / apps / search / index.py
diff --git a/apps/search/index.py b/apps/search/index.py

index 77ce877..a0bf715 100644 (file)
--- a/apps/search/index.py
+++ b/apps/search/index.py
@@ -1,7 +1,8 @@
  # -*- coding: utf-8 -*-
  
  from django.conf import settings
  # -*- coding: utf-8 -*-
  
  from django.conf import settings
-from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
+from django.dispatch import Signal
+from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \
      File, Field, Integer, \
      NumericField, Version, Document, JavaError, IndexSearcher, \
      QueryParser, PerFieldAnalyzerWrapper, \
      File, Field, Integer, \
      NumericField, Version, Document, JavaError, IndexSearcher, \
      QueryParser, PerFieldAnalyzerWrapper, \
@@ -17,7 +18,7 @@ from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
      # KeywordAnalyzer
  
  # Initialize jvm
      # KeywordAnalyzer
  
  # Initialize jvm
-JVM = initVM(CLASSPATH)
+JVM = initVM(CLASSPATH, maxheap=settings.JVM_MAXHEAP)
  
  import sys
  import os
  
  import sys
  import os
@@ -25,12 +26,16 @@ import re
  import errno
  from librarian import dcparser
  from librarian.parser import WLDocument
  import errno
  from librarian import dcparser
  from librarian.parser import WLDocument
+from lxml import etree
  import catalogue.models
  import catalogue.models
+from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  from multiprocessing.pool import ThreadPool
  from threading import current_thread
  from multiprocessing.pool import ThreadPool
  from threading import current_thread
+from itertools import chain
  import atexit
  import traceback
  import atexit
  import traceback
-
+import logging
+log = logging.getLogger('search')
  
  class WLAnalyzer(PerFieldAnalyzerWrapper):
      def __init__(self):
  
  class WLAnalyzer(PerFieldAnalyzerWrapper):
      def __init__(self):
@@ -54,6 +59,8 @@ class WLAnalyzer(PerFieldAnalyzerWrapper):
          self.addAnalyzer("source_name", simple)
          self.addAnalyzer("publisher", simple)
          self.addAnalyzer("authors", simple)
          self.addAnalyzer("source_name", simple)
          self.addAnalyzer("publisher", simple)
          self.addAnalyzer("authors", simple)
+        self.addAnalyzer("title", simple)
+
          self.addAnalyzer("is_book", keyword)
          # shouldn't the title have two forms? _pl and simple?
  
          self.addAnalyzer("is_book", keyword)
          # shouldn't the title have two forms? _pl and simple?
  
@@ -78,7 +85,7 @@ class IndexStore(object):
      """
      def __init__(self):
          self.make_index_dir()
      """
      def __init__(self):
          self.make_index_dir()
-        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
+        self.store = NIOFSDirectory(File(settings.SEARCH_INDEX))
  
      def make_index_dir(self):
          try:
  
      def make_index_dir(self):
          try:
@@ -88,6 +95,9 @@ class IndexStore(object):
                  pass
              else: raise
  
                  pass
              else: raise
  
+    def close(self):
+        # Closes the directory object created in __init__ (self.store).
+        self.store.close()
+
  
  class IndexChecker(IndexStore):
      def __init__(self):
  
  class IndexChecker(IndexStore):
      def __init__(self):
@@ -107,7 +117,7 @@ class Snippets(object):
      """
      SNIPPET_DIR = "snippets"
  
      """
      SNIPPET_DIR = "snippets"
  
-    def __init__(self, book_id):
+    def __init__(self, book_id, revision=None):
          try:
              os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
          except OSError as exc:
          try:
              os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
          except OSError as exc:
@@ -115,15 +125,32 @@ class Snippets(object):
                  pass
              else: raise
          self.book_id = book_id
                  pass
              else: raise
          self.book_id = book_id
+        self.revision = revision
          self.file = None
  
          self.file = None
  
+    @property
+    def path(self):
+        """
+        Path of this book's snippet file inside the snippet directory.
+        The file is named after the book id, followed by ".<revision>"
+        whenever self.revision is truthy.
+        """
+        if not self.revision:
+            file_name = "%d" % self.book_id
+        else:
+            file_name = "%d.%d" % (self.book_id, self.revision)
+
+        snippet_dir = os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR)
+        return os.path.join(snippet_dir, file_name)
+
      def open(self, mode='r'):
          """
          Open the snippet file. Call .close() afterwards.
          """
          if not 'b' in mode:
              mode += 'b'
      def open(self, mode='r'):
          """
          Open the snippet file. Call .close() afterwards.
          """
          if not 'b' in mode:
              mode += 'b'
-        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
+
+        if 'w' in mode:
+            if os.path.exists(self.path):
+                self.revision = 1
+                while True:
+                    if not os.path.exists(self.path):
+                        break
+                    self.revision += 1
+
+        self.file = open(self.path, mode)
          self.position = 0
          return self
  
          self.position = 0
          return self
  
@@ -152,6 +179,17 @@ class Snippets(object):
          """Close snippet file"""
          self.file.close()
  
          """Close snippet file"""
          self.file.close()
  
+    def remove(self):
+        """
+        Delete the book's snippet files: first the un-revisioned one, then
+        revisions 1, 2, ... until an unlink fails with OSError.
+        If the un-revisioned file cannot be removed nothing else is tried.
+        """
+        self.revision = None
+        try:
+            os.unlink(self.path)
+        except OSError:
+            return
+
+        self.revision = 1
+        while True:
+            try:
+                os.unlink(self.path)
+            except OSError:
+                break
+            self.revision += 1
+
  
  class BaseIndex(IndexStore):
      """
  
  class BaseIndex(IndexStore):
      """
@@ -165,11 +203,13 @@ class BaseIndex(IndexStore):
              analyzer = WLAnalyzer()
          self.analyzer = analyzer
  
              analyzer = WLAnalyzer()
          self.analyzer = analyzer
  
-    def open(self, analyzer=None):
+    def open(self, timeout=None):
+        # Creates an IndexWriter on self.store configured with self.analyzer,
+        # keeps it in self.index and returns it.
+        # Raises Exception when a writer is already open on this instance.
+        # timeout: when truthy, passed as a long to
+        # IndexWriterConfig.setWriteLockTimeout (presumably milliseconds --
+        # confirm against the Lucene docs).
          if self.index:
              raise Exception("Index is already opened")
          if self.index:
              raise Exception("Index is already opened")
-        self.index = IndexWriter(self.store, self.analyzer,\
-                                 IndexWriter.MaxFieldLength.LIMITED)
+        conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
+        if timeout:
+            conf.setWriteLockTimeout(long(timeout))
+        self.index = IndexWriter(self.store, conf)
          return self.index
  
      def optimize(self):
          return self.index
  
      def optimize(self):
@@ -179,11 +219,15 @@ class BaseIndex(IndexStore):
          try:
              self.index.optimize()
          except JavaError, je:
          try:
              self.index.optimize()
          except JavaError, je:
-            print "Error during optimize phase, check index: %s" % je
+            log.error("Error during optimize phase, check index: %s" % je)
  
          self.index.close()
          self.index = None
  
  
          self.index.close()
          self.index = None
  
+        index_changed.send_robust(self)
+
+        super(BaseIndex, self).close()
+
      def __enter__(self):
          self.open()
          return self
      def __enter__(self):
          self.open()
          return self
@@ -192,6 +236,9 @@ class BaseIndex(IndexStore):
          self.close()
  
  
          self.close()
  
  
+index_changed = Signal()
+
+
  class Index(BaseIndex):
      """
      Class indexing books.
  class Index(BaseIndex):
      """
      Class indexing books.
@@ -199,39 +246,92 @@ class Index(BaseIndex):
      def __init__(self, analyzer=None):
          super(Index, self).__init__(analyzer)
  
      def __init__(self, analyzer=None):
          super(Index, self).__init__(analyzer)
  
-    def index_tags(self):
+    def index_tags(self, *tags, **kw):
          """
          Re-index global tag list.
          Removes all tags from index, then index them again.
          Indexed fields include: id, name (with and without polish stems), category
          """
          """
          Re-index global tag list.
          Removes all tags from index, then index them again.
          Indexed fields include: id, name (with and without polish stems), category
          """
-        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
-        self.index.deleteDocuments(q)
+        # *tags: specific Tag / PDCounterAuthor / PDCounterBook objects to
+        #     refresh; when none are given every tag document is rebuilt.
+        # remove_only (keyword): when true, documents are only deleted and
+        #     nothing is added back.
+        remove_only = kw.get('remove_only', False)
+        # first, remove tags from index.
+        if tags:
+            # one (tag_id AND tag_category) clause per tag, OR-ed together;
+            # the category is needed because ids of catalogue tags and
+            # pdcounter objects are matched on the same "tag_id" field.
+            q = BooleanQuery()
+            for tag in tags:
+                b_id_cat = BooleanQuery()
  
  
-        for tag in catalogue.models.Tag.objects.all():
-            doc = Document()
-            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
-            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
-            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
-            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
-            self.index.addDocument(doc)
+                q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True)
+                b_id_cat.add(q_id, BooleanClause.Occur.MUST)
+
+                if isinstance(tag, PDCounterAuthor):
+                    q_cat = TermQuery(Term('tag_category', 'pd_author'))
+                elif isinstance(tag, PDCounterBook):
+                    q_cat = TermQuery(Term('tag_category', 'pd_book'))
+                else:
+                    q_cat = TermQuery(Term('tag_category', tag.category))
+                b_id_cat.add(q_cat, BooleanClause.Occur.MUST)
+
+                q.add(b_id_cat, BooleanClause.Occur.SHOULD)
+        else:  # all
+            q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
+
+        # The delete must run for both branches: the per-tag query built
+        # above has no effect unless it is executed here.
+        self.index.deleteDocuments(q)
+
+        if not remove_only:
+            # then add them [all or just one passed]
+            if not tags:
+                tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
+                    PDCounterAuthor.objects.all(), \
+                    PDCounterBook.objects.all())
+
+            for tag in tags:
+                if isinstance(tag, PDCounterAuthor):
+                    doc = Document()
+                    doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
+                    doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
+                    doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
+                    doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
+                    doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
+                    self.index.addDocument(doc)
+                elif isinstance(tag, PDCounterBook):
+                    doc = Document()
+                    doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
+                    doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED))
+                    doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED))
+                    doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
+                    doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
+                    self.index.addDocument(doc)
+                else:
+                    doc = Document()
+                    doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
+                    doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
+                    doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
+                    doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
+                    self.index.addDocument(doc)
  
      def create_book_doc(self, book):
          """
          Create a lucene document referring book id.
          """
          doc = Document()
  
      def create_book_doc(self, book):
          """
          Create a lucene document referring book id.
          """
          doc = Document()
-        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
+        # Stores book_id, plus parent_id when the book has a parent.
+        # Ids are passed through int() before setIntValue (presumably because
+        # the ORM may hand back a long -- confirm).
+        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
          if book.parent is not None:
          if book.parent is not None:
-            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
+            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
          return doc
  
          return doc
  
-    def remove_book(self, book):
+    def remove_book(self, book_or_id, remove_snippets=True):
          """Removes a book from search index.
          book - Book instance."""
          """Removes a book from search index.
          book - Book instance."""
-        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
+        # book_or_id: a catalogue.models.Book instance or a bare book id.
+        # remove_snippets: when true, the book's snippet files are deleted too.
+        is_book = isinstance(book_or_id, catalogue.models.Book)
+        book_id = book_or_id.id if is_book else book_or_id
+
+        q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True)
          self.index.deleteDocuments(q)
  
          self.index.deleteDocuments(q)
  
+        if remove_snippets:
+            Snippets(book_id).remove()
+
      def index_book(self, book, book_info=None, overwrite=True):
          """
          Indexes the book.
      def index_book(self, book, book_info=None, overwrite=True):
          """
          Indexes the book.
@@ -239,21 +339,26 @@ class Index(BaseIndex):
          and calls self.index_content() to index the contents of the book.
          """
          if overwrite:
          and calls self.index_content() to index the contents of the book.
          """
          if overwrite:
-            self.remove_book(book)
+            # we don't remove snippets, since they might be still needed by
+            # threads using not reopened index
+            self.remove_book(book, remove_snippets=False)
  
          book_doc = self.create_book_doc(book)
  
          book_doc = self.create_book_doc(book)
-        meta_fields = self.extract_metadata(book, book_info)
+        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
+        # let's not index it - it's only used for extracting publish date
+        if 'source_name' in meta_fields:
+            del meta_fields['source_name']
+        
          for f in meta_fields.values():
              if isinstance(f, list) or isinstance(f, tuple):
                  for elem in f:
                      book_doc.add(elem)
              else:
                  book_doc.add(f)
          for f in meta_fields.values():
              if isinstance(f, list) or isinstance(f, tuple):
                  for elem in f:
                      book_doc.add(elem)
              else:
                  book_doc.add(f)
-
          self.index.addDocument(book_doc)
          del book_doc
  
          self.index.addDocument(book_doc)
          del book_doc
  
-        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])
+        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
  
      master_tags = [
          'opowiadanie',
  
      master_tags = [
          'opowiadanie',
@@ -261,12 +366,23 @@ class Index(BaseIndex):
          'dramat_wierszowany_l',
          'dramat_wierszowany_lp',
          'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
          'dramat_wierszowany_l',
          'dramat_wierszowany_lp',
          'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
-        'wywiad'
+        'wywiad',
+        ]
+
+    ignore_content_tags = [
+        'uwaga', 'extra',
+        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
+        'didaskalia',
+        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
          ]
  
          ]
  
-    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
+    footnote_tags = ['pa', 'pt', 'pr', 'pe']
+
+    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
  
  
-    def extract_metadata(self, book, book_info=None):
+    published_date_re = re.compile("([0-9]+)[\]. ]*$")
+
+    def extract_metadata(self, book, book_info=None, dc_only=None):
          """
          Extract metadata from book and returns a map of fields keyed by fieldname
          """
          """
          Extract metadata from book and returns a map of fields keyed by fieldname
          """
@@ -281,6 +397,8 @@ class Index(BaseIndex):
  
          # validator, name
          for field in dcparser.BookInfo.FIELDS:
  
          # validator, name
          for field in dcparser.BookInfo.FIELDS:
+            if dc_only and field.name not in dc_only:
+                continue
              if hasattr(book_info, field.name):
                  if not getattr(book_info, field.name):
                      continue
              if hasattr(book_info, field.name):
                  if not getattr(book_info, field.name):
                      continue
@@ -306,6 +424,15 @@ class Index(BaseIndex):
                      fields[field.name] = Field(field.name, "%04d%02d%02d" %\
                                                 (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
  
                      fields[field.name] = Field(field.name, "%04d%02d%02d" %\
                                                 (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
  
+        # get published date
+        pd = None
+        if hasattr(book_info, 'source_name') and book_info.source_name:
+            match = self.published_date_re.search(book_info.source_name)
+            if match is not None:
+                pd = str(match.groups()[0])
+        if not pd: pd = ""
+        fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
+
          return fields
  
      def add_gaps(self, fields, fieldname):
          return fields
  
      def add_gaps(self, fields, fieldname):
@@ -338,15 +465,33 @@ class Index(BaseIndex):
          if master is None:
              return []
  
          if master is None:
              return []
  
-        def walker(node):
-            yield node, None
-            for child in list(node):
-                for b, e in walker(child):
-                    yield b, e
-            yield None, node
+        def walker(node, ignore_tags=[]):
+            # Generator walking the element tree depth-first. It yields
+            # (start, text, end) triples in which exactly one slot is set:
+            # the element on entry, a piece of text (node.text or node.tail),
+            # or the element on exit.
+            # A node whose tag is in ignore_tags is skipped together with its
+            # subtree, but its tail text is still yielded.
+            # NOTE(review): the recursive call below does not forward
+            # ignore_tags, so the filter only affects the node walker() was
+            # first called with -- confirm whether nested ignored tags
+            # should be skipped as well.
+
+            if node.tag not in ignore_tags:
+                yield node, None, None
+                if node.text is not None:
+                    yield None, node.text, None
+                for child in list(node):
+                    for b, t, e in walker(child):
+                        yield b, t, e
+                yield None, None, node
+
+            if node.tail is not None:
+                yield None, node.tail, None
              return
  
          def fix_format(text):
              return
  
          def fix_format(text):
+            #            separator = [u" ", u"\t", u".", u";", u","]
+            if isinstance(text, list):
+                # need to join it first
+                text = filter(lambda s: s is not None, content)
+                text = u' '.join(text)
+                # for i in range(len(text)):
+                #     if i > 0:
+                #         if text[i][0] not in separator\
+                #             and text[i - 1][-1] not in separator:
+                #          text.insert(i, u" ")
+
              return re.sub("(?m)/$", "", text)
  
          def add_part(snippets, **fields):
              return re.sub("(?m)/$", "", text)
  
          def add_part(snippets, **fields):
@@ -365,6 +510,8 @@ class Index(BaseIndex):
              snip_pos = snippets.add(fields["content"])
              doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
              doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
              snip_pos = snippets.add(fields["content"])
              doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
              doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
+            if snippets.revision:
+                doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision))
  
              if 'fragment_anchor' in fields:
                  doc.add(Field("fragment_anchor", fields['fragment_anchor'],
  
              if 'fragment_anchor' in fields:
                  doc.add(Field("fragment_anchor", fields['fragment_anchor'],
@@ -399,24 +546,51 @@ class Index(BaseIndex):
  
                  if header.tag in self.skip_header_tags:
                      continue
  
                  if header.tag in self.skip_header_tags:
                      continue
+                if header.tag is etree.Comment:
+                    continue
  
  
-                content = u' '.join([t for t in header.itertext()])
-                content = fix_format(content)
-
-                doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)
-
-                self.index.addDocument(doc)
-
-                for start, end in walker(header):
+                # section content
+                content = []
+                footnote = []
+
+                def all_content(text):
+                    for frag in fragments.values():
+                        frag['content'].append(text)
+                    content.append(text)
+                handle_text = [all_content]
+
+
+                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
+                    # handle footnotes
+                    if start is not None and start.tag in self.footnote_tags:
+                        footnote = []
+                        def collect_footnote(t):
+                            footnote.append(t)
+                        handle_text.append(collect_footnote)
+                    elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
+                        handle_text.pop()
+                        doc = add_part(snippets, header_index=position, header_type=header.tag,
+                                       content=u''.join(footnote),
+                                       is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
+                
+                        self.index.addDocument(doc)
+                        #print "@ footnote text: %s" % footnote
+                        footnote = []
+                    
+                    # handle fragments and themes.
                      if start is not None and start.tag == 'begin':
                          fid = start.attrib['id'][1:]
                          fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                      if start is not None and start.tag == 'begin':
                          fid = start.attrib['id'][1:]
                          fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
-                        fragments[fid]['content'].append(start.tail)
+
+                    # themes for this fragment
                      elif start is not None and start.tag == 'motyw':
                          fid = start.attrib['id'][1:]
                      elif start is not None and start.tag == 'motyw':
                          fid = start.attrib['id'][1:]
+                        handle_text.append(None)
                          if start.text is not None:
                              fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                          if start.text is not None:
                              fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
-                        fragments[fid]['content'].append(start.tail)
+                    elif end is not None and end.tag == 'motyw':
+                        handle_text.pop()
+
                      elif start is not None and start.tag == 'end':
                          fid = start.attrib['id'][1:]
                          if fid not in fragments:
                      elif start is not None and start.tag == 'end':
                          fid = start.attrib['id'][1:]
                          if fid not in fragments:
@@ -426,26 +600,30 @@ class Index(BaseIndex):
                              continue  # empty themes list.
                          del fragments[fid]
  
                              continue  # empty themes list.
                          del fragments[fid]
  
-                        def jstr(l):
-                            return u' '.join(map(
-                                lambda x: x == None and u'(none)' or unicode(x),
-                                l))
-
                          doc = add_part(snippets,
                                         header_type=frag['start_header'],
                                         header_index=frag['start_section'],
                                         header_span=position - frag['start_section'] + 1,
                                         fragment_anchor=fid,
                          doc = add_part(snippets,
                                         header_type=frag['start_header'],
                                         header_index=frag['start_section'],
                                         header_span=position - frag['start_section'] + 1,
                                         fragment_anchor=fid,
-                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
+                                       content=fix_format(frag['content']),
                                         themes=frag['themes'])
                                         themes=frag['themes'])
-
+                        #print '@ FRAG %s' % frag['content']
                          self.index.addDocument(doc)
                          self.index.addDocument(doc)
-                    elif start is not None:
-                        for frag in fragments.values():
-                            frag['content'].append(start.text)
-                    elif end is not None:
-                        for frag in fragments.values():
-                            frag['content'].append(end.tail)
+
+                        # Collect content.
+
+                    if text is not None and handle_text is not []:
+                        hdl = handle_text[-1]
+                        if hdl is not None:
+                            hdl(text)
+
+                        # in the end, add a section text.
+                doc = add_part(snippets, header_index=position, header_type=header.tag,
+                               content=fix_format(content))
+                #print '@ CONTENT: %s' % fix_format(content)
+
+                self.index.addDocument(doc)
+
          finally:
              snippets.close()
  
          finally:
              snippets.close()
  
@@ -455,7 +633,7 @@ def log_exception_wrapper(f):
          try:
              f(*a)
          except Exception, e:
          try:
              f(*a)
          except Exception, e:
-            print("Error in indexing thread: %s" % e)
+            log.error("Error in indexing thread: %s" % e)
              traceback.print_exc()
              raise e
      return _wrap
              traceback.print_exc()
              raise e
      return _wrap
@@ -471,12 +649,11 @@ class ReusableIndex(Index):
      """
      index = None
  
      """
      index = None
  
-    def open(self, analyzer=None, threads=4):
-        if ReusableIndex.index is not None:
+    def open(self, analyzer=None, **kw):
+        # Reuses the class-level writer when one exists; otherwise opens a
+        # new one, stores it on the class and registers close_reusable to
+        # run at interpreter exit.
+        # analyzer is accepted for backward compatibility only and is not
+        # forwarded: the inherited open() takes `timeout` as its first
+        # argument, so passing analyzer positionally would feed it to
+        # long(timeout).
+        # **kw is forwarded to Index.open (e.g. timeout=...).
+        if ReusableIndex.index:
              self.index = ReusableIndex.index
          else:
              self.index = ReusableIndex.index
          else:
-            print("opening index")
-            Index.open(self, analyzer)
+            Index.open(self, **kw)
              ReusableIndex.index = self.index
              atexit.register(ReusableIndex.close_reusable)
  
              ReusableIndex.index = self.index
              atexit.register(ReusableIndex.close_reusable)
  
@@ -486,13 +663,16 @@ class ReusableIndex(Index):
  
      @staticmethod
      def close_reusable():
  
      @staticmethod
      def close_reusable():
-        if ReusableIndex.index is not None:
+        # Registered with atexit by open(). When a shared writer exists it is
+        # optimized and closed, the class-level reference is cleared, and
+        # index_changed is sent with sender None.
+        if ReusableIndex.index:
              ReusableIndex.index.optimize()
              ReusableIndex.index.close()
              ReusableIndex.index = None
  
              ReusableIndex.index.optimize()
              ReusableIndex.index.close()
              ReusableIndex.index = None
  
+            index_changed.send_robust(None)
+
      def close(self):
      def close(self):
-        pass
+        # Commits on the shared writer instead of closing it, so the writer
+        # stays available for reuse; the actual close is in close_reusable().
+        if ReusableIndex.index:
+            ReusableIndex.index.commit()
  
  
  class JoinSearch(object):
  
  
  class JoinSearch(object):
@@ -539,74 +719,156 @@ class JoinSearch(object):
  
  
  class SearchResult(object):
  
  
  class SearchResult(object):
-    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
-        self.snippets = []
+    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
+        if tokens_cache is None: tokens_cache = {}
  
          if score:
  
          if score:
-            self.score = score
+            self._score = score
          else:
          else:
-            self.score = scoreDocs.score
+            self._score = scoreDocs.score
+
+        self.boost = 1.0
  
  
-        self.hits = []
+        self._hits = []
+        self._processed_hits = None  # processed hits
  
  
-        stored = searcher.doc(scoreDocs.doc)
+        stored = search.searcher.doc(scoreDocs.doc)
          self.book_id = int(stored.get("book_id"))
  
          self.book_id = int(stored.get("book_id"))
  
+        pd = stored.get("published_date")
+        try:
+            self.published_date = int(pd)
+        except ValueError:
+            self.published_date = 0
+
          header_type = stored.get("header_type")
          header_type = stored.get("header_type")
-        if not header_type:
-            return
+        # we have a content hit in some header or fragment
+        if header_type is not None:
+            sec = (header_type, int(stored.get("header_index")))
+            header_span = stored.get('header_span')
+            header_span = header_span is not None and int(header_span) or 1
  
  
-        sec = (header_type, int(stored.get("header_index")))
-        header_span = stored.get('header_span')
-        header_span = header_span is not None and int(header_span) or 1
+            fragment = stored.get("fragment_anchor")
  
  
-        fragment = stored.get("fragment_anchor")
+            if snippets:
+                snippets = snippets.replace("/\n", "\n")
+            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
  
  
-        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': [snippets]})
+            self._hits.append(hit)
  
  
-        self.hits.append(hit)
+        self.search = search
+        self.searched = searched
+        self.tokens_cache = tokens_cache
+
+    @property
+    def score(self):
+        return self._score * self.boost
  
      def merge(self, other):
          if self.book_id != other.book_id:
              raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
  
      def merge(self, other):
          if self.book_id != other.book_id:
              raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
-        self.hits += other.hits
+        self._hits += other._hits
          if other.score > self.score:
          if other.score > self.score:
-            self.score = other.score
+            self._score = other._score
          return self
  
      def get_book(self):
          return self
  
      def get_book(self):
+        if hasattr(self, '_book'):
+            return self._book
          return catalogue.models.Book.objects.get(id=self.book_id)
  
      book = property(get_book)
  
          return catalogue.models.Book.objects.get(id=self.book_id)
  
      book = property(get_book)
  
-    def process_hits(self):
-        frags = filter(lambda r: r[1] is not None, self.hits)
-        sect = filter(lambda r: r[1] is None, self.hits)
+    @property
+    def hits(self):
+        if self._processed_hits is not None:
+            return self._processed_hits
+
+        POSITION = 0
+        FRAGMENT = 1
+        POSITION_INDEX = 1
+        POSITION_SPAN = 2
+        SCORE = 2
+        OTHER = 3
+
+        # split the raw hits into fragments and sections
+        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
+
+        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
+
+        # sections not covered by fragments
          sect = filter(lambda s: 0 == len(filter(
          sect = filter(lambda s: 0 == len(filter(
-            lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2],
+            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
+            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
              frags)), sect)
  
          hits = []
  
              frags)), sect)
  
          hits = []
  
+        def remove_duplicates(lst, keyfn, compare):
+            els = {}
+            for e in lst:
+                eif = keyfn(e)
+                if eif in els:
+                    if compare(els[eif], e) >= 1:
+                        continue
+                els[eif] = e
+            return els.values()
+
+        # remove fragments with duplicated fid's and duplicated snippets
+        frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
+        frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
+                                  lambda a, b: cmp(a[SCORE], b[SCORE]))
+
+        # remove duplicate sections
+        sections = {}
+
          for s in sect:
          for s in sect:
-            m = {'score': s[2],
-                 'header_index': s[0][1]
+            si = s[POSITION][POSITION_INDEX]
+            # skip existing
+            if si in sections:
+                if sections[si]['score'] >= s[SCORE]:
+                    continue
+
+            m = {'score': s[SCORE],
+                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                   }
                   }
-            m.update(s[3])
-            hits.append(m)
+            m.update(s[OTHER])
+            sections[si] = m
+
+        hits = sections.values()
  
          for f in frags:
  
          for f in frags:
-            frag = catalogue.models.Fragment.objects.get(anchor=f[1])
-            m = {'score': f[2],
+            try:
+                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
+            except catalogue.models.Fragment.DoesNotExist:
+                # stale index
+                continue
+
+            # Figure out if we were searching for a token matching some word in theme name.
+            themes = frag.tags.filter(category='theme')
+            themes_hit = []
+            if self.searched is not None:
+                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
+                for theme in themes:
+                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
+                    for t in tokens:
+                        if t in name_tokens:
+                            if not theme in themes_hit:
+                                themes_hit.append(theme)
+                            break
+
+            m = {'score': f[SCORE],
                   'fragment': frag,
                   'fragment': frag,
-                 'themes': frag.tags.filter(category='theme')
+                 'section_number': f[POSITION][POSITION_INDEX] + 1,
+                 'themes': themes,
+                 'themes_hit': themes_hit
                   }
                   }
-            m.update(f[3])
+            m.update(f[OTHER])
              hits.append(m)
  
          hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
  
              hits.append(m)
  
          hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
  
-        print("--- %s" % hits)
+        self._processed_hits = hits
  
          return hits
  
  
          return hits
  
@@ -620,13 +882,17 @@ class SearchResult(object):
              for r in rl:
                  if r.book_id in books:
                      books[r.book_id].merge(r)
              for r in rl:
                  if r.book_id in books:
                      books[r.book_id].merge(r)
-                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                  else:
                      books[r.book_id] = r
          return books.values()
  
      def __cmp__(self, other):
                  else:
                      books[r.book_id] = r
          return books.values()
  
      def __cmp__(self, other):
-        return cmp(self.score, other.score)
+        c = cmp(self.score, other.score)
+        if c == 0:
+            # this is inverted, because earlier date is better
+            return cmp(other.published_date, self.published_date)
+        else:
+            return c
  
  
  class Hint(object):
  
  
  class Hint(object):
@@ -660,7 +926,7 @@ class Hint(object):
                  lst = self.book_tags.get(t.category, [])
                  lst.append(t)
                  self.book_tags[t.category] = lst
                  lst = self.book_tags.get(t.category, [])
                  lst.append(t)
                  self.book_tags[t.category] = lst
-            if t.category in ['theme']:
+            if t.category in ['theme', 'theme_pl']:
                  self.part_tags.append(t)
  
      def tag_filter(self, tags, field='tags'):
                  self.part_tags.append(t)
  
      def tag_filter(self, tags, field='tags'):
@@ -732,12 +998,31 @@ class Search(IndexStore):
          IndexStore.__init__(self)
          self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
          # self.analyzer = WLAnalyzer()
          IndexStore.__init__(self)
          self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
          # self.analyzer = WLAnalyzer()
-        self.searcher = IndexSearcher(self.store, True)
+        reader = IndexReader.open(self.store, True)
+        self.searcher = IndexSearcher(reader)
          self.parser = QueryParser(Version.LUCENE_34, default_field,
                                    self.analyzer)
  
          self.parent_filter = TermsFilter()
          self.parent_filter.addTerm(Term("is_book", "true"))
          self.parser = QueryParser(Version.LUCENE_34, default_field,
                                    self.analyzer)
  
          self.parent_filter = TermsFilter()
          self.parent_filter.addTerm(Term("is_book", "true"))
+        index_changed.connect(self.reopen)
+
+    def close(self):
+        reader = self.searcher.getIndexReader()
+        self.searcher.close()
+        reader.close()
+        super(Search, self).close()
+        index_changed.disconnect(self.reopen)
+
+    def reopen(self, **unused):
+        reader = self.searcher.getIndexReader()
+        rdr = reader.reopen()
+        if not rdr.equals(reader):
+            log.debug('Reopening index')
+            oldsearch = self.searcher
+            self.searcher = IndexSearcher(rdr)
+            oldsearch.close()
+            reader.close()
  
      def query(self, query):
          """Parse query in default Lucene Syntax. (for humans)
  
      def query(self, query):
          """Parse query in default Lucene Syntax. (for humans)
@@ -756,11 +1041,14 @@ class Search(IndexStore):
              bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
          return (bks, tops.totalHits)
  
              bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
          return (bks, tops.totalHits)
  
-    def get_tokens(self, searched, field='content'):
+    def get_tokens(self, searched, field='content', cached=None):
          """returns tokens analyzed by a proper (for a field) analyzer
          argument can be: StringReader, string/unicode, or tokens. In the last case
          they will just be returned (so we can reuse tokens, if we don't change the analyzer)
          """
          """returns tokens analyzed by a proper (for a field) analyzer
          argument can be: StringReader, string/unicode, or tokens. In the last case
          they will just be returned (so we can reuse tokens, if we don't change the analyzer)
          """
+        if cached is not None and field in cached:
+            return cached[field]
+
          if isinstance(searched, str) or isinstance(searched, unicode):
              searched = StringReader(searched)
          elif isinstance(searched, list):
          if isinstance(searched, str) or isinstance(searched, unicode):
              searched = StringReader(searched)
          elif isinstance(searched, list):
@@ -772,9 +1060,14 @@ class Search(IndexStore):
          while tokens.incrementToken():
              cta = tokens.getAttribute(CharTermAttribute.class_)
              toks.append(cta.toString())
          while tokens.incrementToken():
              cta = tokens.getAttribute(CharTermAttribute.class_)
              toks.append(cta.toString())
+
+        if cached is not None:
+            cached[field] = toks
+
          return toks
  
          return toks
  
-    def fuzziness(self, fuzzy):
+    @staticmethod
+    def fuzziness(fuzzy):
          """Helper method to sanitize fuzziness"""
          if not fuzzy:
              return None
          """Helper method to sanitize fuzziness"""
          if not fuzzy:
              return None
@@ -795,7 +1088,6 @@ class Search(IndexStore):
                  fuzzterms = []
  
                  while True:
                  fuzzterms = []
  
                  while True:
-                    #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                      ft = fuzzterm.term()
                      if ft:
                          fuzzterms.append(ft)
                      ft = fuzzterm.term()
                      if ft:
                          fuzzterms.append(ft)
@@ -812,7 +1104,8 @@ class Search(IndexStore):
                  phrase.add(term)
          return phrase
  
                  phrase.add(term)
          return phrase
  
-    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
+    @staticmethod
+    def make_term_query(tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
          """
          Returns term queries joined by boolean query.
          modal - applies to boolean query
          """
          Returns term queries joined by boolean query.
          modal - applies to boolean query
@@ -828,9 +1121,40 @@ class Search(IndexStore):
              q.add(BooleanClause(term, modal))
          return q
  
              q.add(BooleanClause(term, modal))
          return q
  
-    # def content_query(self, query):
-    #     return BlockJoinQuery(query, self.parent_filter,
-    #                           BlockJoinQuery.ScoreMode.Total)
+    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
+                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
+        if filters is None: filters = []
+        if tokens_cache is None: tokens_cache = {}
+
+        tokens = self.get_tokens(searched, field, cached=tokens_cache)
+
+        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
+        if book:
+            filters.append(self.term_filter(Term('is_book', 'true')))
+        top = self.searcher.search(query, self.chain_filters(filters), max_results)
+
+        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
+
+    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
+                    filters=None, tokens_cache=None, boost=None, snippets=True):
+        if filters is None: filters = []
+        if tokens_cache is None: tokens_cache = {}
+
+        if book:
+            filters.append(self.term_filter(Term('is_book', 'true')))
+
+        query = BooleanQuery()
+
+        for fld in fields:
+            tokens = self.get_tokens(searched, fld, cached=tokens_cache)
+
+            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
+                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
+
+        top = self.searcher.search(query, self.chain_filters(filters), max_results)
+
+        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
+                             snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
  
      def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
          """
  
      def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
          """
@@ -853,12 +1177,39 @@ class Search(IndexStore):
                  self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                  max_results)
              for found in top.scoreDocs:
                  self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                  max_results)
              for found in top.scoreDocs:
-                books.append(SearchResult(self.searcher, found))
+                books.append(SearchResult(self, found, how_found="search_perfect_book"))
+        return books
+
+    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
+        fields_to_search = ['tags', 'authors', 'title']
+
+        only_in = None
+        if hint:
+            if not hint.should_search_for_book():
+                return []
+            fields_to_search = hint.just_search_in(fields_to_search)
+            only_in = hint.book_filter()
+
+        tokens = self.get_tokens(searched, field='SIMPLE')
+
+        q = BooleanQuery()
+
+        for fld in fields_to_search:
+            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
+                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
+
+        books = []
+        top = self.searcher.search(q,
+                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
+            max_results)
+        for found in top.scoreDocs:
+            books.append(SearchResult(self, found, how_found="search_book"))
+
          return books
  
      def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
          """
          return books
  
      def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
          """
-        Search for book parts which containt a phrase perfectly matching (with a slop of 2, default for make_phrase())
+        Search for book parts which contain a phrase perfectly matching (with a slop of 2, default for make_phrase())
          some part/fragment of the book.
          """
          qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
          some part/fragment of the book.
          """
          qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
@@ -874,16 +1225,17 @@ class Search(IndexStore):
                                                             flt]),
                                         max_results)
              for found in top.scoreDocs:
                                                             flt]),
                                         max_results)
              for found in top.scoreDocs:
-                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q)))
+                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
  
          return books
  
  
          return books
  
-    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
+    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
          """
          Tries to use search terms to match different fields of book (or its parts).
          E.g. one word can be an author surname, another be a part of the title, and the rest
          are some words from third chapter.
          """
          """
          Tries to use search terms to match different fields of book (or its parts).
          E.g. one word can be an author surname, another be a part of the title, and the rest
          are some words from third chapter.
          """
+        if tokens_cache is None: tokens_cache = {}
          books = []
          only_in = None
  
          books = []
          only_in = None
  
@@ -893,29 +1245,38 @@ class Search(IndexStore):
          # content only query : themes x content
          q = BooleanQuery()
  
          # content only query : themes x content
          q = BooleanQuery()
  
-        tokens = self.get_tokens(searched)
-        if hint is None or hint.just_search_in(['themes_pl']) != []:
-            q.add(BooleanClause(self.make_term_query(tokens, field='themes_pl',
+        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
+        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
+
+        # only search in themes when we do not already filter by themes
+        if hint is None or hint.just_search_in(['themes']) != []:
+            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                       fuzzy=fuzzy), BooleanClause.Occur.MUST))
  
                                                       fuzzy=fuzzy), BooleanClause.Occur.MUST))
  
-        q.add(BooleanClause(self.make_term_query(tokens, field='content',
+        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                   fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
  
          topDocs = self.searcher.search(q, only_in, max_results)
          for found in topDocs.scoreDocs:
                                                   fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
  
          topDocs = self.searcher.search(q, only_in, max_results)
          for found in topDocs.scoreDocs:
-            books.append(SearchResult(self.searcher, found))
+            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
  
          # query themes/content x author/title/tags
          q = BooleanQuery()
  
          # query themes/content x author/title/tags
          q = BooleanQuery()
-        #        in_meta = BooleanQuery()
          in_content = BooleanQuery()
          in_content = BooleanQuery()
+        in_meta = BooleanQuery()
+
+        for fld in ['themes_pl', 'content']:
+            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
  
  
-        for fld in ['themes', 'content', 'tags', 'authors', 'title']:
-            in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
+        for fld in ['tags', 'authors', 'title']:
+            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
+
+        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
+        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
  
          topDocs = self.searcher.search(q, only_in, max_results)
          for found in topDocs.scoreDocs:
  
          topDocs = self.searcher.search(q, only_in, max_results)
          for found in topDocs.scoreDocs:
-            books.append(SearchResult(self.searcher, found))
+            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
  
          return books
  
  
          return books
  
@@ -961,7 +1322,6 @@ class Search(IndexStore):
  
          # return None
  
  
          # return None
  
-
      def get_snippets(self, scoreDoc, query, field='content'):
          """
          Returns a snippet for found scoreDoc.
      def get_snippets(self, scoreDoc, query, field='content'):
          """
          Returns a snippet for found scoreDoc.
@@ -971,19 +1331,40 @@ class Search(IndexStore):
  
          stored = self.searcher.doc(scoreDoc.doc)
  
  
          stored = self.searcher.doc(scoreDoc.doc)
  
+        position = stored.get('snippets_position')
+        length = stored.get('snippets_length')
+        if position is None or length is None:
+            return None
+        revision = stored.get('snippets_revision')
+        if revision: revision = int(revision)
+
          # locate content.
          # locate content.
-        snippets = Snippets(stored.get('book_id')).open()
+        book_id = int(stored.get('book_id'))
+        snippets = Snippets(book_id, revision=revision)
+
          try:
          try:
-            text = snippets.get((int(stored.get('snippets_position')),
-                                 int(stored.get('snippets_length'))))
-        finally:
-            snippets.close()
+            snippets.open()
+        except IOError, e:
+            log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
+            return []
  
  
-        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
-        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
-        #        import pdb; pdb.set_trace()
-        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
+        try:
+            try:
+                text = snippets.get((int(position),
+                                     int(length)))
+            finally:
+                snippets.close()
+
+            tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
+            #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
+            snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
  
  
+        except Exception, e:
+            e2 = e
+            if hasattr(e, 'getJavaException'):
+                e2 = unicode(e.getJavaException())
+            raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
+                e2)
          return snip
  
      @staticmethod
          return snip
  
      @staticmethod
@@ -1003,33 +1384,56 @@ class Search(IndexStore):
          if terms:
              return JArray('object')(terms, Term)
  
          if terms:
              return JArray('object')(terms, Term)
  
-    def search_tags(self, query, filter=None, max_results=40):
+    def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
          """
          Search for Tag objects using query.
          """
          """
          Search for Tag objects using query.
          """
-        tops = self.searcher.search(query, filter, max_results)
+        if not pdcounter:
+            filters = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
+        tops = self.searcher.search(query, filt, max_results)
  
          tags = []
          for found in tops.scoreDocs:
              doc = self.searcher.doc(found.doc)
  
          tags = []
          for found in tops.scoreDocs:
              doc = self.searcher.doc(found.doc)
-            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
-            tags.append(tag)
-            print "%s (%d) -> %f" % (tag, tag.id, found.score)
+            is_pdcounter = doc.get('is_pdcounter')
+            category = doc.get('tag_category')
+            try:
+                if is_pdcounter == 'true':
+                    if category == 'pd_author':
+                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
+                    elif category == 'pd_book':
+                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
+                        tag.category = 'pd_book'  # make it look more like a tag.
+                    else:
+                        print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
+                else:
+                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
+                    # don't add the pdcounter tag if same tag already exists
+
+                tags.append(tag)
+
+            except catalogue.models.Tag.DoesNotExist: pass
+            except PDCounterAuthor.DoesNotExist: pass
+            except PDCounterBook.DoesNotExist: pass
+
+        log.debug('search_tags: %s' % tags)
  
          return tags
  
  
          return tags
  
-    def search_books(self, query, filter=None, max_results=10):
+    def search_books(self, query, filt=None, max_results=10):
          """
          Searches for Book objects using query
          """
          bks = []
          """
          Searches for Book objects using query
          """
          bks = []
-        tops = self.searcher.search(query, filter, max_results)
+        tops = self.searcher.search(query, filt, max_results)
          for found in tops.scoreDocs:
              doc = self.searcher.doc(found.doc)
          for found in tops.scoreDocs:
              doc = self.searcher.doc(found.doc)
-            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
+            try:
+                bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
+            except catalogue.models.Book.DoesNotExist: pass
          return bks
  
          return bks
  
-    def create_prefix_phrase(self, toks, field):
+    def make_prefix_phrase(self, toks, field):
          q = MultiPhraseQuery()
          for i in range(len(toks)):
              t = Term(field, toks[i])
          q = MultiPhraseQuery()
          for i in range(len(toks)):
              t = Term(field, toks[i])
@@ -1055,7 +1459,7 @@ class Search(IndexStore):
  
          return only_term
  
  
          return only_term
  
-    def hint_tags(self, string, max_results=50):
+    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
          """
          Return auto-complete hints for tags
          using prefix search.
          """
          Return auto-complete hints for tags
          using prefix search.
@@ -1064,14 +1468,17 @@ class Search(IndexStore):
          top = BooleanQuery()
  
          for field in ['tag_name', 'tag_name_pl']:
          top = BooleanQuery()
  
          for field in ['tag_name', 'tag_name_pl']:
-            q = self.create_prefix_phrase(toks, field)
+            if prefix:
+                q = self.make_prefix_phrase(toks, field)
+            else:
+                q = self.make_term_query(toks, field, fuzzy=fuzzy)
              top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
  
          no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
  
              top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
  
          no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
  
-        return self.search_tags(top, no_book_cat, max_results=max_results)
+        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
  
  
-    def hint_books(self, string, max_results=50):
+    def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
          """
          Returns auto-complete hints for book titles
          Because we do not index 'pseudo' title-tags.
          """
          Returns auto-complete hints for book titles
          Because we do not index 'pseudo' title-tags.
@@ -1079,7 +1486,10 @@ class Search(IndexStore):
          """
          toks = self.get_tokens(string, field='SIMPLE')
  
          """
          toks = self.get_tokens(string, field='SIMPLE')
  
-        q = self.create_prefix_phrase(toks, 'title')
+        if prefix:
+            q = self.make_prefix_phrase(toks, 'title')
+        else:
+            q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
  
          return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
  
  
          return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
  
@@ -1089,7 +1499,7 @@ class Search(IndexStore):
          Chains a filter list together
          """
          filters = filter(lambda x: x is not None, filters)
          Chains a filter list together
          """
          filters = filter(lambda x: x is not None, filters)
-        if not filters:
+        if not filters or filters is []:
              return None
          chf = ChainedFilter(JArray('object')(filters, Filter), op)
          return chf
              return None
          chf = ChainedFilter(JArray('object')(filters, Filter), op)
          return chf