From: Marcin Koziej Date: Tue, 31 Jan 2012 09:05:00 +0000 (+0100) Subject: some speedups for batch indexing X-Git-Url: https://git.mdrn.pl/wolnelektury.git/commitdiff_plain/a31dc3ca0f28d89b0b18b9d4c857f64ac9488b7a?ds=inline some speedups for batch indexing --- diff --git a/apps/catalogue/management/commands/importbooks.py b/apps/catalogue/management/commands/importbooks.py index 5b4d499a4..7fe602254 100644 --- a/apps/catalogue/management/commands/importbooks.py +++ b/apps/catalogue/management/commands/importbooks.py @@ -6,7 +6,7 @@ import os import sys import time from optparse import make_option - +from django.conf import settings from django.core.management.base import BaseCommand from django.core.management.color import color_style from django.core.files import File @@ -14,6 +14,7 @@ from django.core.files import File from catalogue.models import Book from picture.models import Picture +from search import Index class Command(BaseCommand): option_list = BaseCommand.option_list + ( @@ -47,7 +48,8 @@ class Command(BaseCommand): build_txt=options.get('build_txt'), build_pdf=options.get('build_pdf'), build_mobi=options.get('build_mobi'), - search_index=options.get('search_index')) + search_index=options.get('search_index'), + search_index_reuse=True, search_index_tags=False) for ebook_format in Book.ebook_formats: if os.path.isfile(file_base + '.' + ebook_format): getattr(book, '%s_file' % ebook_format).save( @@ -80,6 +82,14 @@ class Command(BaseCommand): time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(wait_until)), wait_until - time.time()) + if options.get('search_index') and not settings.NO_SEARCH_INDEX: + index = Index() + index.open() + try: + index.index_tags() + finally: + index.close() + # Start transaction management. transaction.commit_unless_managed() transaction.enter_transaction_management() @@ -87,7 +97,7 @@ class Command(BaseCommand): files_imported = 0 files_skipped = 0 - + for dir_name in directories: if not os.path.isdir(dir_name): print self.style.ERROR("%s: Not a directory. Skipping." % dir_name) diff --git a/apps/catalogue/models.py b/apps/catalogue/models.py index 091e66148..7105cfd8a 100644 --- a/apps/catalogue/models.py +++ b/apps/catalogue/models.py @@ -643,7 +643,7 @@ class Book(models.Model): result = create_zip.delay(paths, "%s_%s" % (self.slug, format_)) return result.wait() - def search_index(self, book_info=None, reuse_index=False): + def search_index(self, book_info=None, reuse_index=False, index_tags=True): if reuse_index: idx = search.ReusableIndex() else: @@ -652,7 +652,8 @@ class Book(models.Model): idx.open() try: idx.index_book(self, book_info) - idx.index_tags() + if index_tags: + idx.index_tags() finally: idx.close() @@ -675,7 +676,7 @@ class Book(models.Model): @classmethod def from_text_and_meta(cls, raw_file, book_info, overwrite=False, build_epub=True, build_txt=True, build_pdf=True, build_mobi=True, - search_index=True): + search_index=True, search_index_tags=True, search_index_reuse=False): import re from sortify import sortify @@ -747,7 +748,7 @@ class Book(models.Model): book.build_mobi() if not settings.NO_SEARCH_INDEX and search_index: - book.search_index() + book.search_index(index_tags=search_index_tags, reuse_index=search_index_reuse) #index_book.delay(book.id, book_info) book_descendants = list(book.children.all()) diff --git a/apps/search/index.py b/apps/search/index.py index 71c0ed236..12554d2be 100644 --- a/apps/search/index.py +++ b/apps/search/index.py @@ -331,11 +331,13 @@ class Index(BaseIndex): (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED) # get published date - source = book_info.source_name - if hasattr(book_info, 'source_name'): - match = self.published_date_re.search(source) + pd = None + if hasattr(book_info, 'source_name') and book_info.source_name: + match = self.published_date_re.search(book_info.source_name) if match is not None: - fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED) + pd = str(match.groups()[0]) + if not pd: pd = "" + fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED) return fields @@ -476,7 +478,7 @@ class Index(BaseIndex): is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)) self.index.addDocument(doc) - print "@ footnote text: %s" % footnote + #print "@ footnote text: %s" % footnote footnote = [] # handle fragments and themes. @@ -509,7 +511,7 @@ class Index(BaseIndex): fragment_anchor=fid, content=fix_format(frag['content']), themes=frag['themes']) - print '@ FRAG %s' % frag['content'] + #print '@ FRAG %s' % frag['content'] self.index.addDocument(doc) # Collect content. @@ -522,7 +524,7 @@ class Index(BaseIndex): # in the end, add a section text. doc = add_part(snippets, header_index=position, header_type=header.tag, content=fix_format(content)) - print '@ CONTENT: %s' % fix_format(content) + #print '@ CONTENT: %s' % fix_format(content) self.index.addDocument(doc)