some speedups for batch indexing
author Marcin Koziej <marcin@lolownia.org>
Tue, 31 Jan 2012 09:05:00 +0000 (10:05 +0100)
committer Marcin Koziej <marcin@lolownia.org>
Tue, 31 Jan 2012 09:05:00 +0000 (10:05 +0100)
apps/catalogue/management/commands/importbooks.py
apps/catalogue/models.py
apps/search/index.py

index 5b4d499..7fe6022 100644 (file)
@@ -6,7 +6,7 @@ import os
 import sys
 import time
 from optparse import make_option
 import sys
 import time
 from optparse import make_option
-
+from django.conf import settings
 from django.core.management.base import BaseCommand
 from django.core.management.color import color_style
 from django.core.files import File
 from django.core.management.base import BaseCommand
 from django.core.management.color import color_style
 from django.core.files import File
@@ -14,6 +14,7 @@ from django.core.files import File
 from catalogue.models import Book
 from picture.models import Picture
 
 from catalogue.models import Book
 from picture.models import Picture
 
+from search import Index
 
 class Command(BaseCommand):
     option_list = BaseCommand.option_list + (
 
 class Command(BaseCommand):
     option_list = BaseCommand.option_list + (
@@ -47,7 +48,8 @@ class Command(BaseCommand):
                                                     build_txt=options.get('build_txt'),
                                                     build_pdf=options.get('build_pdf'),
                                                     build_mobi=options.get('build_mobi'),
                                                     build_txt=options.get('build_txt'),
                                                     build_pdf=options.get('build_pdf'),
                                                     build_mobi=options.get('build_mobi'),
-                                                    search_index=options.get('search_index'))
+                                                    search_index=options.get('search_index'),
+                                                    search_index_reuse=True, search_index_tags=False)
         for ebook_format in Book.ebook_formats:
             if os.path.isfile(file_base + '.' + ebook_format):
                 getattr(book, '%s_file' % ebook_format).save(
         for ebook_format in Book.ebook_formats:
             if os.path.isfile(file_base + '.' + ebook_format):
                 getattr(book, '%s_file' % ebook_format).save(
@@ -80,6 +82,14 @@ class Command(BaseCommand):
                     time.strftime('%Y-%m-%d %H:%M:%S',
                     time.localtime(wait_until)), wait_until - time.time())
 
                     time.strftime('%Y-%m-%d %H:%M:%S',
                     time.localtime(wait_until)), wait_until - time.time())
 
+        if options.get('search_index') and not settings.NO_SEARCH_INDEX:
+            index = Index()
+            index.open()
+            try:
+                index.index_tags()
+            finally:
+                index.close()
+
         # Start transaction management.
         transaction.commit_unless_managed()
         transaction.enter_transaction_management()
         # Start transaction management.
         transaction.commit_unless_managed()
         transaction.enter_transaction_management()
@@ -87,7 +97,7 @@ class Command(BaseCommand):
 
         files_imported = 0
         files_skipped = 0
 
         files_imported = 0
         files_skipped = 0
-
+        
         for dir_name in directories:
             if not os.path.isdir(dir_name):
                 print self.style.ERROR("%s: Not a directory. Skipping." % dir_name)
         for dir_name in directories:
             if not os.path.isdir(dir_name):
                 print self.style.ERROR("%s: Not a directory. Skipping." % dir_name)
index 091e661..7105cfd 100644 (file)
@@ -643,7 +643,7 @@ class Book(models.Model):
         result = create_zip.delay(paths, "%s_%s" % (self.slug, format_))
         return result.wait()
 
         result = create_zip.delay(paths, "%s_%s" % (self.slug, format_))
         return result.wait()
 
-    def search_index(self, book_info=None, reuse_index=False):
+    def search_index(self, book_info=None, reuse_index=False, index_tags=True):
         if reuse_index:
             idx = search.ReusableIndex()
         else:
         if reuse_index:
             idx = search.ReusableIndex()
         else:
@@ -652,7 +652,8 @@ class Book(models.Model):
         idx.open()
         try:
             idx.index_book(self, book_info)
         idx.open()
         try:
             idx.index_book(self, book_info)
-            idx.index_tags()
+            if index_tags:
+                idx.index_tags()
         finally:
             idx.close()
 
         finally:
             idx.close()
 
@@ -675,7 +676,7 @@ class Book(models.Model):
     @classmethod
     def from_text_and_meta(cls, raw_file, book_info, overwrite=False,
             build_epub=True, build_txt=True, build_pdf=True, build_mobi=True,
     @classmethod
     def from_text_and_meta(cls, raw_file, book_info, overwrite=False,
             build_epub=True, build_txt=True, build_pdf=True, build_mobi=True,
-            search_index=True):
+            search_index=True, search_index_tags=True, search_index_reuse=False):
         import re
         from sortify import sortify
 
         import re
         from sortify import sortify
 
@@ -747,7 +748,7 @@ class Book(models.Model):
             book.build_mobi()
 
         if not settings.NO_SEARCH_INDEX and search_index:
             book.build_mobi()
 
         if not settings.NO_SEARCH_INDEX and search_index:
-            book.search_index()
+            book.search_index(index_tags=search_index_tags, reuse_index=search_index_reuse)
             #index_book.delay(book.id, book_info)
 
         book_descendants = list(book.children.all())
             #index_book.delay(book.id, book_info)
 
         book_descendants = list(book.children.all())
index 71c0ed2..12554d2 100644 (file)
@@ -331,11 +331,13 @@ class Index(BaseIndex):
                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
 
         # get published date
                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
 
         # get published date
-        source = book_info.source_name
-        if hasattr(book_info, 'source_name'):
-            match = self.published_date_re.search(source)
+        pd = None
+        if hasattr(book_info, 'source_name') and book_info.source_name:
+            match = self.published_date_re.search(book_info.source_name)
             if match is not None:
             if match is not None:
-                fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
+                pd = str(match.groups()[0])
+        if not pd: pd = ""
+        fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
 
         return fields
 
 
         return fields
 
@@ -476,7 +478,7 @@ class Index(BaseIndex):
                                        is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
                 
                         self.index.addDocument(doc)
                                        is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
                 
                         self.index.addDocument(doc)
-                        print "@ footnote text: %s" % footnote
+                        #print "@ footnote text: %s" % footnote
                         footnote = []
                     
                     # handle fragments and themes.
                         footnote = []
                     
                     # handle fragments and themes.
@@ -509,7 +511,7 @@ class Index(BaseIndex):
                                        fragment_anchor=fid,
                                        content=fix_format(frag['content']),
                                        themes=frag['themes'])
                                        fragment_anchor=fid,
                                        content=fix_format(frag['content']),
                                        themes=frag['themes'])
-                        print '@ FRAG %s' % frag['content']
+                        #print '@ FRAG %s' % frag['content']
                         self.index.addDocument(doc)
 
                         # Collect content.
                         self.index.addDocument(doc)
 
                         # Collect content.
@@ -522,7 +524,7 @@ class Index(BaseIndex):
                         # in the end, add a section text.
                 doc = add_part(snippets, header_index=position, header_type=header.tag,
                                content=fix_format(content))
                         # in the end, add a section text.
                 doc = add_part(snippets, header_index=position, header_type=header.tag,
                                content=fix_format(content))
-                print '@ CONTENT: %s' % fix_format(content)
+                #print '@ CONTENT: %s' % fix_format(content)
 
                 self.index.addDocument(doc)
 
 
                 self.index.addDocument(doc)