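Some speedups for batch indexing: reuse the search index while importing books, index tags once per import run instead of once per book, and silence debug prints in the indexer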

author Marcin Koziej <marcin@lolownia.org>

Tue, 31 Jan 2012 09:05:00 +0000 (10:05 +0100)

committer Marcin Koziej <marcin@lolownia.org>

Tue, 31 Jan 2012 09:05:00 +0000 (10:05 +0100)
author Marcin Koziej <marcin@lolownia.org>
Tue, 31 Jan 2012 09:05:00 +0000 (10:05 +0100)
committer Marcin Koziej <marcin@lolownia.org>
Tue, 31 Jan 2012 09:05:00 +0000 (10:05 +0100)
diff --git a/apps/catalogue/management/commands/importbooks.py b/apps/catalogue/management/commands/importbooks.py

index 5b4d499..7fe6022 100644 (file)
--- a/apps/catalogue/management/commands/importbooks.py
+++ b/apps/catalogue/management/commands/importbooks.py
@@ -6,7 +6,7 @@ import os
  import sys
  import time
  from optparse import make_option
-
+from django.conf import settings
  from django.core.management.base import BaseCommand
  from django.core.management.color import color_style
  from django.core.files import File
@@ -14,6 +14,7 @@ from django.core.files import File
  from catalogue.models import Book
  from picture.models import Picture
  
+from search import Index
  
  class Command(BaseCommand):
      option_list = BaseCommand.option_list + (
@@ -47,7 +48,8 @@ class Command(BaseCommand):
                                                      build_txt=options.get('build_txt'),
                                                      build_pdf=options.get('build_pdf'),
                                                      build_mobi=options.get('build_mobi'),
-                                                    search_index=options.get('search_index'))
+                                                    search_index=options.get('search_index'),
+                                                    search_index_reuse=True, search_index_tags=False)
          for ebook_format in Book.ebook_formats:
              if os.path.isfile(file_base + '.' + ebook_format):
                  getattr(book, '%s_file' % ebook_format).save(
@@ -80,6 +82,14 @@ class Command(BaseCommand):
                      time.strftime('%Y-%m-%d %H:%M:%S',
                      time.localtime(wait_until)), wait_until - time.time())
  
+        if options.get('search_index') and not settings.NO_SEARCH_INDEX:
+            index = Index()
+            index.open()
+            try:
+                index.index_tags()
+            finally:
+                index.close()
+
          # Start transaction management.
          transaction.commit_unless_managed()
          transaction.enter_transaction_management()
@@ -87,7 +97,7 @@ class Command(BaseCommand):
  
          files_imported = 0
          files_skipped = 0
-
+        
          for dir_name in directories:
              if not os.path.isdir(dir_name):
                  print self.style.ERROR("%s: Not a directory. Skipping." % dir_name)
diff --git a/apps/catalogue/models.py b/apps/catalogue/models.py

index 091e661..7105cfd 100644 (file)
--- a/apps/catalogue/models.py
+++ b/apps/catalogue/models.py
@@ -643,7 +643,7 @@ class Book(models.Model):
          result = create_zip.delay(paths, "%s_%s" % (self.slug, format_))
          return result.wait()
  
-    def search_index(self, book_info=None, reuse_index=False):
+    def search_index(self, book_info=None, reuse_index=False, index_tags=True):
          if reuse_index:
              idx = search.ReusableIndex()
          else:
@@ -652,7 +652,8 @@ class Book(models.Model):
          idx.open()
          try:
              idx.index_book(self, book_info)
-            idx.index_tags()
+            if index_tags:
+                idx.index_tags()
          finally:
              idx.close()
  
@@ -675,7 +676,7 @@ class Book(models.Model):
      @classmethod
      def from_text_and_meta(cls, raw_file, book_info, overwrite=False,
              build_epub=True, build_txt=True, build_pdf=True, build_mobi=True,
-            search_index=True):
+            search_index=True, search_index_tags=True, search_index_reuse=False):
          import re
          from sortify import sortify
  
@@ -747,7 +748,7 @@ class Book(models.Model):
              book.build_mobi()
  
          if not settings.NO_SEARCH_INDEX and search_index:
-            book.search_index()
+            book.search_index(index_tags=search_index_tags, reuse_index=search_index_reuse)
              #index_book.delay(book.id, book_info)
  
          book_descendants = list(book.children.all())
diff --git a/apps/search/index.py b/apps/search/index.py

index 71c0ed2..12554d2 100644 (file)
--- a/apps/search/index.py
+++ b/apps/search/index.py
@@ -331,11 +331,13 @@ class Index(BaseIndex):
                                                 (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
  
          # get published date
-        source = book_info.source_name
-        if hasattr(book_info, 'source_name'):
-            match = self.published_date_re.search(source)
+        pd = None
+        if hasattr(book_info, 'source_name') and book_info.source_name:
+            match = self.published_date_re.search(book_info.source_name)
              if match is not None:
-                fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
+                pd = str(match.groups()[0])
+        if not pd: pd = ""
+        fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
  
          return fields
  
@@ -476,7 +478,7 @@ class Index(BaseIndex):
                                         is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
                  
                          self.index.addDocument(doc)
-                        print "@ footnote text: %s" % footnote
+                        #print "@ footnote text: %s" % footnote
                          footnote = []
                      
                      # handle fragments and themes.
@@ -509,7 +511,7 @@ class Index(BaseIndex):
                                         fragment_anchor=fid,
                                         content=fix_format(frag['content']),
                                         themes=frag['themes'])
-                        print '@ FRAG %s' % frag['content']
+                        #print '@ FRAG %s' % frag['content']
                          self.index.addDocument(doc)
  
                          # Collect content.
@@ -522,7 +524,7 @@ class Index(BaseIndex):
                          # in the end, add a section text.
                  doc = add_part(snippets, header_index=position, header_type=header.tag,
                                 content=fix_format(content))
-                print '@ CONTENT: %s' % fix_format(content)
+                #print '@ CONTENT: %s' % fix_format(content)
  
                  self.index.addDocument(doc)
author	Marcin Koziej <marcin@lolownia.org>
	Tue, 31 Jan 2012 09:05:00 +0000 (10:05 +0100)
committer	Marcin Koziej <marcin@lolownia.org>
	Tue, 31 Jan 2012 09:05:00 +0000 (10:05 +0100)
apps/catalogue/management/commands/importbooks.py		patch \| blob \| history
apps/catalogue/models.py		patch \| blob \| history
apps/search/index.py		patch \| blob \| history