Index with Solr works.
[wolnelektury.git] / apps / search / index.py
index 6883978..e7f28c9 100644 (file)
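
This commit swaps the embedded PyLucene index for a Solr backend accessed through the sunburnt client library: documents become plain Python dicts keyed by Solr schema fields, and queries are built from sunburnt Q objects instead of Lucene query classes. A minimal sketch of the sunburnt API the new code leans on (the URL and document values are illustrative; the real endpoint comes from settings.SOLR):

    # Illustrative sunburnt usage; the URL and values below are made up.
    import sunburnt

    si = sunburnt.SolrInterface("http://localhost:8983/solr/wl/")

    # Indexing: a document is a dict whose keys match the Solr schema.
    si.add({"uid": "book1", "book_id": 1, "title": u"Pan Tadeusz", "is_book": True})
    si.commit()

    # Querying: Q objects combine with & and |, Django-ORM style.
    for doc in si.query(si.Q(title=u"tadeusz") & si.Q(is_book=True)).execute():
        print doc["uid"]
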
@@ -1,26 +1,7 @@
 # -*- coding: utf-8 -*-
 
 from django.conf import settings
-from django.dispatch import Signal
-from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \
-    File, Field, Integer, \
-    NumericField, Version, Document, JavaError, IndexSearcher, \
-    QueryParser, PerFieldAnalyzerWrapper, \
-    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
-    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
-    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
-    HashSet, BooleanClause, Term, CharTermAttribute, \
-    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
-    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, Integer, \
-    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
-    BooleanFilter, TermsFilter, FilterClause, QueryWrapperFilter, \
-    initVM, CLASSPATH, JArray, JavaError
-    # KeywordAnalyzer
-
-# Initialize jvm
-JVM = initVM(CLASSPATH)
-
-import sys
+
 import os
 import re
 import errno
@@ -29,84 +10,17 @@ from librarian.parser import WLDocument
 from lxml import etree
 import catalogue.models
 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
-from multiprocessing.pool import ThreadPool
-from threading import current_thread
 from itertools import chain
-import atexit
 import traceback
 import logging
 log = logging.getLogger('search')
-
-class WLAnalyzer(PerFieldAnalyzerWrapper):
-    def __init__(self):
-        polish = PolishAnalyzer(Version.LUCENE_34)
-        #        polish_gap.setPositionIncrementGap(999)
-
-        simple = SimpleAnalyzer(Version.LUCENE_34)
-        #        simple_gap.setPositionIncrementGap(999)
-
-        keyword = KeywordAnalyzer(Version.LUCENE_34)
-
-        # not sure if needed: there's NOT_ANALYZED meaning basically the same
-
-        PerFieldAnalyzerWrapper.__init__(self, polish)
-
-        self.addAnalyzer("tags", simple)
-        self.addAnalyzer("technical_editors", simple)
-        self.addAnalyzer("editors", simple)
-        self.addAnalyzer("url", keyword)
-        self.addAnalyzer("source_url", keyword)
-        self.addAnalyzer("source_name", simple)
-        self.addAnalyzer("publisher", simple)
-        self.addAnalyzer("authors", simple)
-        self.addAnalyzer("title", simple)
-
-        self.addAnalyzer("is_book", keyword)
-        # shouldn't the title have two forms? _pl and simple?
-
-        self.addAnalyzer("themes", simple)
-        self.addAnalyzer("themes_pl", polish)
-
-        self.addAnalyzer("tag_name", simple)
-        self.addAnalyzer("tag_name_pl", polish)
-
-        self.addAnalyzer("translators", simple)
-
-        self.addAnalyzer("KEYWORD", keyword)
-        self.addAnalyzer("SIMPLE", simple)
-        self.addAnalyzer("POLISH", polish)
-
-
-class IndexStore(object):
-    """
-    Provides access to search index.
-
-    self.store - lucene index directory
-    """
-    def __init__(self):
-        self.make_index_dir()
-        self.store = NIOFSDirectory(File(settings.SEARCH_INDEX))
-
-    def make_index_dir(self):
-        try:
-            os.makedirs(settings.SEARCH_INDEX)
-        except OSError as exc:
-            if exc.errno == errno.EEXIST:
-                pass
-            else: raise
-
-    def close(self):
-        self.store.close()
+import sunburnt
+import highlight
 
 
-class IndexChecker(IndexStore):
-    def __init__(self):
-        IndexStore.__init__(self)
-
-    def check(self):
-        checker = CheckIndex(self.store)
-        status = checker.checkIndex()
-        return status
+class SolrIndex(object):
+    def __init__(self, mode=None):
+        self.index = highlight.HLSolrInterface(settings.SOLR, mode=mode)
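
SolrIndex is the new common base class: a thin holder for one Solr connection, replacing the IndexStore/BaseIndex pair that managed a Lucene directory, writer and analyzer. HLSolrInterface comes from the local highlight module imported above, presumably a SolrInterface subclass adding highlighting support. A hypothetical subclass would use it like this:

    # Hypothetical sketch; the 'rw' mode string is an assumption about
    # what HLSolrInterface accepts.
    class TagIndexer(SolrIndex):
        def __init__(self):
            super(TagIndexer, self).__init__(mode='rw')

        def add_tag(self, tag_id, name):
            # self.index is the shared interface set up by SolrIndex
            self.index.add({"uid": "tag%d" % tag_id,
                            "tag_id": tag_id,
                            "tag_name": name})
            self.index.commit()
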
 
 
 class Snippets(object):
@@ -191,60 +105,38 @@ class Snippets(object):
             pass
 
 
-class BaseIndex(IndexStore):
-    """
-    Base index class.
-    Provides basic operations on index: opening, closing, optimizing.
-    """
-    def __init__(self, analyzer=None):
-        super(BaseIndex, self).__init__()
-        self.index = None
-        if not analyzer:
-            analyzer = WLAnalyzer()
-        self.analyzer = analyzer
-
-    def open(self, timeout=None):
-        if self.index:
-            raise Exception("Index is already opened")
-        conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
-        if timeout:
-            conf.setWriteLockTimeout(long(timeout))
-        self.index = IndexWriter(self.store, conf)
-        return self.index
-
-    def optimize(self):
-        self.index.optimize()
-
-    def close(self):
-        try:
-            self.index.optimize()
-        except JavaError, je:
-            log.error("Error during optimize phase, check index: %s" % je)
-
-        self.index.close()
-        self.index = None
-
-        index_changed.send_robust(self)
-
-        super(BaseIndex, self).close()
-
-    def __enter__(self):
-        self.open()
-        return self
-
-    def __exit__(self, type, value, tb):
-        self.close()
-
-
-index_changed = Signal()
-
-
-class Index(BaseIndex):
+class Index(SolrIndex):
     """
     Class indexing books.
     """
-    def __init__(self, analyzer=None):
-        super(Index, self).__init__(analyzer)
+    def __init__(self):
+        super(Index, self).__init__()
+
+    def delete_query(self, *queries):
+        """
+        index.delete(queries=...) doesn't work, so reimplement it
+        by deleting a list of uids instead.
+        """
+        uids = set()
+        for q in queries:
+            if isinstance(q, sunburnt.search.LuceneQuery):
+                q = self.index.query(q)
+            q.field_limiter.update(['uid'])
+            st = 0
+            rows = 100
+            while True:
+                ids = q.paginate(start=st, rows=rows).execute()
+                if not len(ids):
+                    break
+                for res in ids:
+                    uids.add(res['uid'])
+                st += rows
+        # print "Will delete %s" % ','.join(uids)
+        if uids:
+            self.index.delete(uids)
+            return True
+        else:
+            return False
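
delete_query works around the broken index.delete(queries=...) by paginating over the matches 100 rows at a time, collecting their uid values, and issuing a single delete-by-id call at the end. Removing everything indexed for one book, for example, looks like:

    # Usage sketch for the workaround above (the book id is illustrative).
    idx = Index()
    if idx.delete_query(idx.index.Q(book_id=1234)):
        idx.index.commit()   # True means something matched and was deleted
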
 
     def index_tags(self, *tags, **kw):
         """
@@ -255,25 +147,23 @@ class Index(BaseIndex):
         remove_only = kw.get('remove_only', False)
         # first, remove tags from index.
         if tags:
-            q = BooleanQuery()
+            tag_qs = []
             for tag in tags:
-                b_id_cat = BooleanQuery()
-
-                q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True)
-                b_id_cat.add(q_id, BooleanClause.Occur.MUST)
+                q_id = self.index.Q(tag_id=tag.id)
 
                 if isinstance(tag, PDCounterAuthor):
-                    q_cat = TermQuery(Term('tag_category', 'pd_author'))
+                    q_cat = self.index.Q(tag_category='pd_author')
                 elif isinstance(tag, PDCounterBook):
-                    q_cat = TermQuery(Term('tag_category', 'pd_book'))
+                    q_cat = self.index.Q(tag_category='pd_book')
                 else:
-                    q_cat = TermQuery(Term('tag_category', tag.category))
-                b_id_cat.add(q_cat, BooleanClause.Occur.MUST)
+                    q_cat = self.index.Q(tag_category=tag.category)
 
-                q.add(b_id_cat, BooleanClause.Occur.SHOULD)
+                q_id_cat = self.index.Q(q_id & q_cat)
+                tag_qs.append(q_id_cat)
+            self.delete_query(*tag_qs)
         else:  # all
-            q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
-            self.index.deleteDocuments(q)
+            q = self.index.Q(tag_id__any=True)
+            self.delete_query(q)
 
         if not remove_only:
             # then add them [all or just one passed]
@@ -284,37 +174,41 @@ class Index(BaseIndex):
 
             for tag in tags:
                 if isinstance(tag, PDCounterAuthor):
-                    doc = Document()
-                    doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
-                    doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
-                    doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
-                    doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
-                    doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
-                    self.index.addDocument(doc)
+                    doc = {
+                        "tag_id": int(tag.id),
+                        "tag_name": tag.name,
+                        "tag_name_pl": tag.name,
+                        "tag_category": 'pd_author',
+                        "is_pdcounter": True
+                        }
                 elif isinstance(tag, PDCounterBook):
-                    doc = Document()
-                    doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
-                    doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED))
-                    doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED))
-                    doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
-                    doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
-                    self.index.addDocument(doc)
+                    doc = {
+                        "tag_id": int(tag.id),
+                        "tag_name": tag.title,
+                        "tag_name_pl": tag.title,
+                        "tag_category": 'pd_book',
+                        "is_pdcounter": True
+                        }
                 else:
-                    doc = Document()
-                    doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
-                    doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
-                    doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
-                    doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
-                    self.index.addDocument(doc)
+                    doc = {
+                        "tag_id": int(tag.id),
+                        "tag_name": tag.name,
+                        "tag_name_pl": tag.name,
+                        "tag_category": tag.category,
+                        "is_pdcounter": False
+                        }
+                doc['uid'] = "tag%d" % tag.id
+                self.index.add(doc)
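
The three branches above differ only in where the name comes from and in the category; each tag document gets a deterministic uid of the form "tag<id>", so reindexing a tag overwrites its previous Solr document instead of duplicating it. An ordinary catalogue tag comes out roughly as:

    # Illustrative dict produced for a regular catalogue tag.
    doc = {
        "tag_id": 42,
        "tag_name": u"Romantyzm",
        "tag_name_pl": u"Romantyzm",
        "tag_category": "epoch",
        "is_pdcounter": False,
        "uid": "tag42",
    }
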
 
     def create_book_doc(self, book):
         """
        Create a document (dict) referring to the book id.
         """
-        doc = Document()
-        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
+        doc = {
+            'book_id': int(book.id),
+            }
         if book.parent is not None:
-            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
+            doc["parent_id"] = int(book.parent.id)
         return doc
 
     def remove_book(self, book_or_id, remove_snippets=True):
@@ -325,8 +219,7 @@ class Index(BaseIndex):
         else:
             book_id = book_or_id
 
-        q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True)
-        self.index.deleteDocuments(q)
+        self.delete_query(self.index.Q(book_id=book_id))
 
         if remove_snippets:
             snippets = Snippets(book_id)
@@ -346,18 +239,20 @@ class Index(BaseIndex):
         book_doc = self.create_book_doc(book)
         meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
         # let's not index it - it's only used for extracting publish date
-        del meta_fields['source_name']
-        
-        for f in meta_fields.values():
-            if isinstance(f, list) or isinstance(f, tuple):
-                for elem in f:
-                    book_doc.add(elem)
-            else:
-                book_doc.add(f)
-        self.index.addDocument(book_doc)
+        if 'source_name' in meta_fields:
+            del meta_fields['source_name']
+
+        for n, f in meta_fields.items():
+            book_doc[n] = f
+
+        book_doc['uid'] = "book%s" % book_doc['book_id']
+        self.index.add(book_doc)
         del book_doc
 
-        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
+        self.index_content(book, book_fields={
+            'title': meta_fields['title'],
+            'authors': meta_fields['authors'],
+            'published_date': meta_fields['published_date']})
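
index_book now assembles one flat dict per book: metadata fields are merged in directly, the uid is "book<id>", and the fields index_content needs later travel as a dict rather than a list of Lucene Field objects. The resulting document is shaped roughly like:

    # Illustrative shape of the book document sent to Solr.
    book_doc = {
        "book_id": 123,
        "parent_id": 7,                  # only present for child books
        "title": u"Ballady i romanse",
        "authors": u"Adam Mickiewicz",
        "published_date": "1822",
        "uid": "book123",
    }
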
 
     master_tags = [
         'opowiadanie',
@@ -390,9 +285,9 @@ class Index(BaseIndex):
         if book_info is None:
             book_info = dcparser.parse(open(book.xml_file.path))
 
-        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
-        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
-        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
+        fields['slug'] = book.slug
+        fields['tags'] = [t.name for t in book.tags]
+        fields['is_book'] = True
 
         # validator, name
         for field in dcparser.BookInfo.FIELDS:
@@ -407,21 +302,17 @@ class Index(BaseIndex):
                     s = getattr(book_info, field.name)
                     if field.multiple:
                         s = ', '.join(s)
-                    try:
-                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
-                    except JavaError as je:
-                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
+                    fields[field.name] = s
                 elif type_indicator == dcparser.as_person:
                     p = getattr(book_info, field.name)
                     if isinstance(p, dcparser.Person):
                         persons = unicode(p)
                     else:
                         persons = ', '.join(map(unicode, p))
-                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
+                    fields[field.name] = persons
                 elif type_indicator == dcparser.as_date:
                     dt = getattr(book_info, field.name)
-                    fields[field.name] = Field(field.name, "%04d%02d%02d" %\
-                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
+                    fields[field.name] = dt
 
         # get published date
         pd = None
@@ -430,19 +321,19 @@ class Index(BaseIndex):
             if match is not None:
                 pd = str(match.groups()[0])
         if not pd: pd = ""
-        fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
+        fields["published_date"] = pd
 
         return fields
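
extract_metadata now returns plain values (strings, lists, booleans, dates) and leaves all analysis to the Solr schema; note that as_date fields are passed through as date objects instead of being flattened to "YYYYMMDD" strings as the Lucene code did. An illustrative result:

    # Illustrative return value; most keys come from dcparser.BookInfo.FIELDS,
    # so the exact set depends on the book.
    fields = {
        "slug": "pan-tadeusz",
        "tags": [u"Epika", u"Romantyzm"],
        "is_book": True,
        "authors": u"Adam Mickiewicz",
        "published_date": "1834",
    }
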
 
-    def add_gaps(self, fields, fieldname):
-        """
-        Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
-        This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
-        """
-        def gap():
-            while True:
-                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
-        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
+    def add_gaps(self, fields, fieldname):
+        """
+        Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
+        This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
+        """
+        def gap():
+            while True:
+                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
+        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 
     def get_master(self, root):
         """
@@ -452,7 +343,7 @@ class Index(BaseIndex):
             if master.tag in self.master_tags:
                 return master
 
-    def index_content(self, book, book_fields=[]):
+    def index_content(self, book, book_fields={}):
         """
         Walks the book XML and extracts content from it.
         Adds parts for each header tag and for each fragment.
@@ -495,41 +386,31 @@ class Index(BaseIndex):
 
         def add_part(snippets, **fields):
             doc = self.create_book_doc(book)
-            for f in book_fields:
-                doc.add(f)
+            for n, v in book_fields.items():
+                doc[n] = v
 
-            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
-            doc.add(NumericField("header_span", Field.Store.YES, True)\
-                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
-            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
+            doc['header_index'] = fields["header_index"]
+            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
+            doc['header_type'] = fields['header_type']
 
-            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
-                          Field.TermVector.WITH_POSITIONS_OFFSETS))
+            doc['text'] = fields['text']
 
-            snip_pos = snippets.add(fields["content"])
-            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
-            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
+            # snippets
+            snip_pos = snippets.add(fields["text"])
+
+            doc['snippets_position'] = snip_pos[0]
+            doc['snippets_length'] = snip_pos[1]
             if snippets.revision:
-                doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision))
+                doc["snippets_revision"] = snippets.revision
 
             if 'fragment_anchor' in fields:
-                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
-                              Field.Store.YES, Field.Index.NOT_ANALYZED))
+                doc["fragment_anchor"] = fields['fragment_anchor']
 
             if 'themes' in fields:
-                themes, themes_pl = zip(*[
-                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
-                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
-                     for theme in fields['themes']])
-
-                themes = self.add_gaps(themes, 'themes')
-                themes_pl = self.add_gaps(themes_pl, 'themes_pl')
-
-                for t in themes:
-                    doc.add(t)
-                for t in themes_pl:
-                    doc.add(t)
-
+                doc['themes'] = fields['themes']
+            doc['uid'] = "part%s%s%s" % (doc['header_index'],
+                                         doc['header_span'],
+                                         doc.get('fragment_anchor', ''))
             return doc
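
add_part emits one Solr document per section or fragment: the old content field is renamed to text, snippet offsets are stored alongside it, and the uid concatenates header_index, header_span and fragment_anchor. Worth noting: the uid carries no book id, so parts of different books can share a uid. An example fragment document:

    # Illustrative part document for a fragment.
    doc = {
        "book_id": 123,
        "title": u"Ballady i romanse",   # carried over via book_fields
        "header_index": 4,
        "header_span": 2,
        "header_type": "wiersz",
        "text": u"...fragment text...",
        "snippets_position": 1024,
        "snippets_length": 380,
        "fragment_anchor": "f7",
        "themes": [u"Przyroda"],
        "uid": "part42f7",               # header_index + header_span + anchor
    }
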
 
         def give_me_utf8(s):
@@ -554,39 +435,40 @@ class Index(BaseIndex):
 
                 def all_content(text):
                     for frag in fragments.values():
-                        frag['content'].append(text)
+                        frag['text'].append(text)
                     content.append(text)
                 handle_text = [all_content]
 
-
                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                     # handle footnotes
                     if start is not None and start.tag in self.footnote_tags:
                         footnote = []
+
                         def collect_footnote(t):
                             footnote.append(t)
+
                         handle_text.append(collect_footnote)
                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                         handle_text.pop()
                         doc = add_part(snippets, header_index=position, header_type=header.tag,
-                                       content=u''.join(footnote),
-                                       is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
-                
-                        self.index.addDocument(doc)
+                                       text=u''.join(footnote),
+                                       is_footnote=True)
+
+                        self.index.add(doc)
                         #print "@ footnote text: %s" % footnote
                         footnote = []
-                    
+
                     # handle fragments and themes.
                     if start is not None and start.tag == 'begin':
                         fid = start.attrib['id'][1:]
-                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
+                        fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 
                     # themes for this fragment
                     elif start is not None and start.tag == 'motyw':
                         fid = start.attrib['id'][1:]
                         handle_text.append(None)
                         if start.text is not None:
-                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
+                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, start.text.split(',')))
                     elif end is not None and end.tag == 'motyw':
                         handle_text.pop()
 
@@ -604,10 +486,10 @@ class Index(BaseIndex):
                                        header_index=frag['start_section'],
                                        header_span=position - frag['start_section'] + 1,
                                        fragment_anchor=fid,
-                                       content=fix_format(frag['content']),
+                                       text=fix_format(frag['text']),
                                        themes=frag['themes'])
                         #print '@ FRAG %s' % frag['content']
-                        self.index.addDocument(doc)
+                        self.index.add(doc)
 
                         # Collect content.
 
@@ -617,141 +499,51 @@ class Index(BaseIndex):
                             hdl(text)
 
                         # in the end, add a section text.
-                doc = add_part(snippets, header_index=position, header_type=header.tag,
-                               content=fix_format(content))
+                doc = add_part(snippets, header_index=position,
+                               header_type=header.tag, text=fix_format(content))
                 #print '@ CONTENT: %s' % fix_format(content)
 
-                self.index.addDocument(doc)
+                self.index.add(doc)
 
         finally:
             snippets.close()
 
 
-def log_exception_wrapper(f):
-    def _wrap(*a):
-        try:
-            f(*a)
-        except Exception, e:
-            log.error("Error in indexing thread: %s" % e)
-            traceback.print_exc()
-            raise e
-    return _wrap
-
-
-class ReusableIndex(Index):
-    """
-    Works like index, but does not close/optimize Lucene index
-    until program exit (uses atexit hook).
-    This is usefull for importbooks command.
-
-    if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
-    """
-    index = None
-
-    def open(self, analyzer=None, **kw):
-        if ReusableIndex.index:
-            self.index = ReusableIndex.index
-        else:
-            Index.open(self, analyzer, **kw)
-            ReusableIndex.index = self.index
-            atexit.register(ReusableIndex.close_reusable)
-
-    # def index_book(self, *args, **kw):
-    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
-    #     ReusableIndex.pool_jobs.append(job)
-
-    @staticmethod
-    def close_reusable():
-        if ReusableIndex.index:
-            ReusableIndex.index.optimize()
-            ReusableIndex.index.close()
-            ReusableIndex.index = None
-
-            index_changed.send_robust(None)
-
-    def close(self):
-        if ReusableIndex.index:
-            ReusableIndex.index.commit()
-
-
-class JoinSearch(object):
-    """
-    This mixin could be used to handle block join queries.
-    (currently unused)
-    """
-    def __init__(self, *args, **kw):
-        super(JoinSearch, self).__init__(*args, **kw)
-
-    def wrapjoins(self, query, fields=[]):
-        """
-        This functions modifies the query in a recursive way,
-        so Term and Phrase Queries contained, which match
-        provided fields are wrapped in a BlockJoinQuery,
-        and so delegated to children documents.
-        """
-        if BooleanQuery.instance_(query):
-            qs = BooleanQuery.cast_(query)
-            for clause in qs:
-                clause = BooleanClause.cast_(clause)
-                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
-            return qs
-        else:
-            termset = HashSet()
-            query.extractTerms(termset)
-            for t in termset:
-                t = Term.cast_(t)
-                if t.field() not in fields:
-                    return query
-            return BlockJoinQuery(query, self.parent_filter,
-                                  BlockJoinQuery.ScoreMode.Total)
-
-    def bsearch(self, query, max_results=50):
-        q = self.query(query)
-        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
-
-        tops = self.searcher.search(bjq, max_results)
-        bks = []
-        for found in tops.scoreDocs:
-            doc = self.searcher.doc(found.doc)
-            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
-        return (bks, tops.totalHits)
-
 
 class SearchResult(object):
-    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
+    def __init__(self, search, doc, how_found=None, snippets=None, searched=None, tokens_cache=None):
         if tokens_cache is None: tokens_cache = {}
 
-        if score:
-            self._score = score
+        if 'score' in doc:
+            self._score = doc['score']
         else:
-            self._score = scoreDocs.score
+            self._score = 0
 
         self.boost = 1.0
 
         self._hits = []
         self._processed_hits = None  # processed hits
 
-        stored = search.searcher.doc(scoreDocs.doc)
-        self.book_id = int(stored.get("book_id"))
+        self.book_id = int(doc["book_id"])
 
-        pd = stored.get("published_date")
+        pd = doc["published_date"]
         try:
             self.published_date = int(pd)
         except ValueError:
             self.published_date = 0
 
         try:
             self.published_date = int(pd)
         except ValueError:
             self.published_date = 0
 
-        header_type = stored.get("header_type")
+        header_type = doc.get("header_type", None)
         # we have a content hit in some header of fragment
         if header_type is not None:
-            sec = (header_type, int(stored.get("header_index")))
-            header_span = stored.get('header_span')
+            sec = (header_type, int(doc["header_index"]))
+            header_span = doc['header_span']
             header_span = header_span is not None and int(header_span) or 1
 
-            fragment = stored.get("fragment_anchor")
+            fragment = doc.get("fragment_anchor", None)
 
             if snippets:
                 snippets = snippets.replace("/\n", "\n")
-            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
+            hit = (sec + (header_span,), fragment, self._score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
 
             self._hits.append(hit)
 
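SearchResult now consumes a plain result dict instead of a Lucene scoreDoc: the score comes from an optional score field (defaulting to 0), and each content hit is a tuple of section coordinates, fragment anchor, score, and a metadata dict:

    # Illustrative shape of one entry in self._hits; the how_found value
    # is an assumption about what callers pass in.
    hit = (("wiersz", 4, 2),     # (header_type, header_index, header_span)
           "f7",                 # fragment_anchor, or None
           3.14,                 # score
           {"how_found": "search_phrase",
            "snippets": [u"...highlighted snippet..."]})
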
@@ -989,11 +781,11 @@ class Hint(object):
         return some
 
 
-class Search(IndexStore):
+class Search(SolrIndex):
     """
     Search facilities.
     """
-    def __init__(self, default_field="content"):
+    def __init__(self, default_field="text"):
         IndexStore.__init__(self)
         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
         # self.analyzer = WLAnalyzer()
@@ -1040,7 +832,7 @@ class Search(IndexStore):
             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
         return (bks, tops.totalHits)
 
-    def get_tokens(self, searched, field='content', cached=None):
+    def get_tokens(self, searched, field='text', cached=None):
         """returns tokens analyzed by a proper (for a field) analyzer
         argument can be: StringReader, string/unicode, or tokens. In the last case
         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
@@ -1066,7 +858,7 @@ class Search(IndexStore):
         return toks
 
     @staticmethod
-    def fuzziness(self, fuzzy):
+    def fuzziness(fuzzy):
         """Helper method to sanitize fuzziness"""
         if not fuzzy:
             return None
@@ -1075,7 +867,7 @@ class Search(IndexStore):
         else:
             return 0.5
 
-    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
+    def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
         """
         Return a PhraseQuery with a series of tokens.
         """
@@ -1104,7 +896,7 @@ class Search(IndexStore):
         return phrase
 
     @staticmethod
-    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
+    def make_term_query(tokens, field='text', modal='BooleanClause.Occur.SHOULD XXX', fuzzy=False):
         """
         Returns term queries joined by boolean query.
         modal - applies to boolean query
@@ -1211,7 +1003,7 @@ class Search(IndexStore):
         Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
         some part/fragment of the book.
         """
-        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
+        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]
 
         flt = None
         if hint:
@@ -1244,7 +1036,7 @@ class Search(IndexStore):
         # content only query : themes x content
         q = BooleanQuery()
 
-        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
+        tokens_pl = self.get_tokens(searched, field='text', cached=tokens_cache)
         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
 
         # only search in themes when we do not already filter by themes
@@ -1252,7 +1044,7 @@ class Search(IndexStore):
             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
 
-        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
+        q.add(BooleanClause(self.make_term_query(tokens_pl, field='text',
                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 
         topDocs = self.searcher.search(q, only_in, max_results)
@@ -1264,7 +1056,7 @@ class Search(IndexStore):
         in_content = BooleanQuery()
         in_meta = BooleanQuery()
 
-        for fld in ['themes_pl', 'content']:
+        for fld in ['themes_pl', 'text']:
             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
 
         for fld in ['tags', 'authors', 'title']:
@@ -1321,7 +1113,7 @@ class Search(IndexStore):
 
         # return None
 
-    def get_snippets(self, scoreDoc, query, field='content'):
+    def get_snippets(self, scoreDoc, query, field='text'):
         """
         Returns a snippet for found scoreDoc.
         """
@@ -1408,8 +1200,9 @@ class Search(IndexStore):
                 else:
                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                     # don't add the pdcounter tag if same tag already exists
-                if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
-                    tags.append(tag)
+
+                tags.append(tag)
+
             except catalogue.models.Tag.DoesNotExist: pass
             except PDCounterAuthor.DoesNotExist: pass
             except PDCounterBook.DoesNotExist: pass
@@ -1492,7 +1285,7 @@ class Search(IndexStore):
         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
 
     @staticmethod
-    def chain_filters(filters, op=ChainedFilter.AND):
+    def chain_filters(filters, op='XXXChainedFilter.AND'):
         """
         Chains a filter list together
         """