From: Marcin Koziej Date: Wed, 11 Apr 2012 09:24:59 +0000 (+0200) Subject: Merge branch 'lucene_memory' X-Git-Url: https://git.mdrn.pl/wolnelektury.git/commitdiff_plain/6e32f55e3ef50d9f7d3a291c2388c5220851a9b4?hp=d316a52151685a1b7c295baa12b73a73eabe5657 Merge branch 'lucene_memory' --- diff --git a/apps/search/index.py b/apps/search/index.py index 4e71e2500..b689c763c 100644 --- a/apps/search/index.py +++ b/apps/search/index.py @@ -317,14 +317,19 @@ class Index(BaseIndex): doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id))) return doc - def remove_book(self, book, remove_snippets=True): + def remove_book(self, book_or_id, remove_snippets=True): """Removes a book from search index. book - Book instance.""" - q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True) + if isinstance(book_or_id, catalogue.models.Book): + book_id = book_or_id.id + else: + book_id = book_or_id + + q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True) self.index.deleteDocuments(q) if remove_snippets: - snippets = Snippets(book.id) + snippets = Snippets(book_id) snippets.remove() def index_book(self, book, book_info=None, overwrite=True): @@ -339,7 +344,11 @@ class Index(BaseIndex): self.remove_book(book, remove_snippets=False) book_doc = self.create_book_doc(book) - meta_fields = self.extract_metadata(book, book_info) + meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title']) + # let's not index it - it's only used for extracting publish date + if 'source_name' in meta_fields: + del meta_fields['source_name'] + for f in meta_fields.values(): if isinstance(f, list) or isinstance(f, tuple): for elem in f: @@ -373,7 +382,7 @@ class Index(BaseIndex): published_date_re = re.compile("([0-9]+)[\]. ]*$") - def extract_metadata(self, book, book_info=None): + def extract_metadata(self, book, book_info=None, dc_only=None): """ Extract metadata from book and returns a map of fields keyed by fieldname """ @@ -388,6 +397,8 @@ class Index(BaseIndex): # validator, name for field in dcparser.BookInfo.FIELDS: + if dc_only and field.name not in dc_only: + continue if hasattr(book_info, field.name): if not getattr(book_info, field.name): continue @@ -1055,7 +1066,8 @@ class Search(IndexStore): return toks - def fuzziness(self, fuzzy): + @staticmethod + def fuzziness(fuzzy): """Helper method to sanitize fuzziness""" if not fuzzy: return None @@ -1092,7 +1104,8 @@ class Search(IndexStore): phrase.add(term) return phrase - def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False): + @staticmethod + def make_term_query(tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False): """ Returns term queries joined by boolean query. modal - applies to boolean query diff --git a/apps/search/management/commands/optimizeindex.py b/apps/search/management/commands/optimizeindex.py index a8a4cf9dd..51bf95b4e 100644 --- a/apps/search/management/commands/optimizeindex.py +++ b/apps/search/management/commands/optimizeindex.py @@ -1,14 +1,38 @@ from django.core.management.base import BaseCommand -from search import Index +from search import Index, Search +from lucene import IndexReader, IndexSearcher, Term +from catalogue.models import Book + class Command(BaseCommand): help = 'Optimize Lucene search index' args = '' + def delete_old(self, index): + existing_ids = set([book.id for book in Book.objects.all()]) + + reader = IndexReader.open(index.index, False) + searcher = IndexSearcher(reader) + try: + num = searcher.docFreq(Term('is_book', 'true')) + docs = searcher.search(Search.make_term_query(['true'], 'is_book'), num) + for result in docs.scoreDocs: + stored = searcher.doc(result.doc) + book_id = int(stored.get('book_id')) + if not book_id in existing_ids: + print "book id %d doesn't exist." % book_id + index.remove_book(book_id) + finally: + searcher.close() + reader.close() + def handle(self, *args, **opts): index = Index() index.open() + + self.delete_old(index) + try: index.optimize() finally: diff --git a/apps/search/views.py b/apps/search/views.py index 09f217f23..fd5883ede 100644 --- a/apps/search/views.py +++ b/apps/search/views.py @@ -14,6 +14,7 @@ from catalogue.views import JSONResponse from search import Search, JVM, SearchResult from lucene import StringReader from suggest.forms import PublishingSuggestForm +from time import sleep import re import enchant @@ -50,8 +51,21 @@ def did_you_mean(query, tokens): return query + JVM.attachCurrentThread() -search = Search() +_search = None + + +def get_search(): + global _search + + while _search is False: + sleep(1) + + if _search is None: + _search = False + _search = Search() + return _search def hint(request): @@ -60,6 +74,7 @@ def hint(request): return JSONResponse([]) JVM.attachCurrentThread() + search = get_search() hint = search.hint() try: tags = request.GET.get('tags', '') @@ -117,6 +132,7 @@ def main(request): return render_to_response('catalogue/search_too_short.html', {'prefix': query}, context_instance=RequestContext(request)) + search = get_search() # hint.tags(tag_list) # if book: # hint.books(book) diff --git a/wolnelektury/settings/__init__.py b/wolnelektury/settings/__init__.py index f2314a9ad..15126b9a4 100644 --- a/wolnelektury/settings/__init__.py +++ b/wolnelektury/settings/__init__.py @@ -63,6 +63,7 @@ INSTALLED_APPS_OUR = [ 'picture', 'social', 'waiter', + 'search', ] INSTALLED_APPS_CONTRIB = [