# -*- coding: utf-8 -*-
+
from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, File, Field, \
NumericField, Version, Document, JavaError, IndexSearcher, \
BlockJoinQuery, BlockJoinCollector, TermsFilter, \
HashSet, BooleanClause, Term, CharTermAttribute, \
PhraseQuery, StringReader, TermQuery, BlockJoinQuery, \
- Sort, Integer
+ Sort, Integer, \
+ initVM, CLASSPATH
# KeywordAnalyzer
+JVM = initVM(CLASSPATH)
import sys
import os
import errno
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
-
+import traceback
class WLAnalyzer(PerFieldAnalyzerWrapper):
def __init__(self):
if overwrite:
self.remove_book(book)
-
doc = self.extract_metadata(book)
parts = self.extract_content(book)
block = ArrayList().of_(Document)
doc = self.create_book_doc(book)
doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))
- doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))
+ #doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))
doc.add(Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
# validator, name
self.close()
def log_exception_wrapper(f):
    """Wrap *f* so exceptions are logged immediately, then re-raised.

    Jobs submitted to the ThreadPool via apply_async() swallow exceptions
    until AsyncResult.get() is called; this wrapper prints the error and
    traceback from inside the worker thread so failures surface right away.

    Returns a wrapper with the same call signature as *f* that passes
    through *f*'s return value.
    """
    def _wrap(*args, **kwargs):
        try:
            # Forward kwargs too: apply_async() at the call site passes a
            # kwargs dict, which the old (*a)-only wrapper would reject.
            return f(*args, **kwargs)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            # Bare raise preserves the original traceback for the caller
            # (raise e would rebuild it from here).
            raise
    return _wrap
+
+
class ReusableIndex(Index):
"""
Works like index, but does not close/optimize Lucene index
self.index = ReusableIndex.index
else:
print("opening index")
- ReusableIndex.pool = ThreadPool(threads)
+ ReusableIndex.pool = ThreadPool(threads, initializer=lambda: JVM.attachCurrentThread() )
ReusableIndex.pool_jobs = []
Index.open(self, analyzer)
ReusableIndex.index = self.index
atexit.register(ReusableIndex.close_reusable)
def index_book(self, *args, **kw):
- job = ReusableIndex.pool.apply_async(Index.index_book, (self,)+ args, kw)
+ job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
ReusableIndex.pool_jobs.append(job)
@staticmethod
def close_reusable():
if ReusableIndex.index is not None:
- print("closing index")
+ print("wait for indexing to finish")
for job in ReusableIndex.pool_jobs:
- job.wait()
+ job.get()
+ sys.stdout.write('.')
+ sys.stdout.flush()
+ print("done.")
ReusableIndex.pool.close()
ReusableIndex.index.optimize()