doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
return doc
- def remove_book(self, book, remove_snippets=True):
+ def remove_book(self, book_or_id, remove_snippets=True):
"""Removes a book from search index.
book - Book instance."""
- q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
+ if isinstance(book_or_id, catalogue.models.Book):
+ book_id = book_or_id.id
+ else:
+ book_id = book_or_id
+
+ q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True)
self.index.deleteDocuments(q)
if remove_snippets:
- snippets = Snippets(book.id)
+ snippets = Snippets(book_id)
snippets.remove()
def index_book(self, book, book_info=None, overwrite=True):
self.remove_book(book, remove_snippets=False)
book_doc = self.create_book_doc(book)
- meta_fields = self.extract_metadata(book, book_info)
+ meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
+ # let's not index it - it's only used for extracting publish date
+ if 'source_name' in meta_fields:
+ del meta_fields['source_name']
+
for f in meta_fields.values():
if isinstance(f, list) or isinstance(f, tuple):
for elem in f:
published_date_re = re.compile("([0-9]+)[\]. ]*$")
- def extract_metadata(self, book, book_info=None):
+ def extract_metadata(self, book, book_info=None, dc_only=None):
"""
Extract metadata from book and returns a map of fields keyed by fieldname
"""
# validator, name
for field in dcparser.BookInfo.FIELDS:
+ if dc_only and field.name not in dc_only:
+ continue
if hasattr(book_info, field.name):
if not getattr(book_info, field.name):
continue
return toks
- def fuzziness(self, fuzzy):
+ @staticmethod
+ def fuzziness(fuzzy):
"""Helper method to sanitize fuzziness"""
if not fuzzy:
return None
phrase.add(term)
return phrase
- def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
+ @staticmethod
+ def make_term_query(tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
"""
Returns term queries joined by boolean query.
modal - applies to boolean query
from django.core.management.base import BaseCommand
-from search import Index
+from search import Index, Search
+from lucene import IndexReader, IndexSearcher, Term
+from catalogue.models import Book
+
class Command(BaseCommand):
help = 'Optimize Lucene search index'
args = ''
+ def delete_old(self, index):
+ existing_ids = set([book.id for book in Book.objects.all()])
+
+ reader = IndexReader.open(index.index, False)
+ searcher = IndexSearcher(reader)
+ try:
+ num = searcher.docFreq(Term('is_book', 'true'))
+ docs = searcher.search(Search.make_term_query(['true'], 'is_book'), num)
+ for result in docs.scoreDocs:
+ stored = searcher.doc(result.doc)
+ book_id = int(stored.get('book_id'))
+ if not book_id in existing_ids:
+ print "book id %d doesn't exist." % book_id
+ index.remove_book(book_id)
+ finally:
+ searcher.close()
+ reader.close()
+
def handle(self, *args, **opts):
index = Index()
index.open()
+
+ self.delete_old(index)
+
try:
index.optimize()
finally:
from search import Search, JVM, SearchResult
from lucene import StringReader
from suggest.forms import PublishingSuggestForm
+from time import sleep
import re
import enchant
return query
+
JVM.attachCurrentThread()
-search = Search()
+_search = None
+
+
+def get_search():
+ global _search
+
+ while _search is False:
+ sleep(1)
+
+ if _search is None:
+ _search = False
+ _search = Search()
+ return _search
def hint(request):
return JSONResponse([])
JVM.attachCurrentThread()
+ search = get_search()
hint = search.hint()
try:
tags = request.GET.get('tags', '')
return render_to_response('catalogue/search_too_short.html', {'prefix': query},
context_instance=RequestContext(request))
+ search = get_search()
# hint.tags(tag_list)
# if book:
# hint.books(book)