1 # ====================================================================
2 # Licensed under the Apache License, Version 2.0 (the "License");
3 # you may not use this file except in compliance with the License.
4 # You may obtain a copy of the License at
6 # http://www.apache.org/licenses/LICENSE-2.0
8 # Unless required by applicable law or agreed to in writing, software
9 # distributed under the License is distributed on an "AS IS" BASIS,
10 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 # See the License for the specific language governing permissions and
12 # limitations under the License.
13 # ====================================================================
import os

from unittest import TestCase
from time import time
from datetime import timedelta

from lucene import \
     IndexWriter, SimpleAnalyzer, SimpleFSDirectory, Document, Field, \
     File, System


26 class CompoundVersusMultiFileIndexTest(TestCase):
28 def __init__(self, *args):
30 super(CompoundVersusMultiFileIndexTest, self).__init__(*args)
31 self.docs = self.loadDocuments(5000, 10)
35 indexDir = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
38 cIndexDir = "%s-compound" %(indexDir)
39 mIndexDir = "%s-multi" %(indexDir)
43 self.cDir = SimpleFSDirectory(File(cIndexDir))
44 self.mDir = SimpleFSDirectory(File(mIndexDir))
48 for dir, dirnames, filenames in os.walk(dir):
49 for filename in filenames:
50 os.remove(os.path.join(dir, filename))
51 for dirname in dirnames:
52 os.rmdir(os.path.join(dir, dirname))
56 cTiming = self.timeIndexWriter(self.cDir, True)
57 mTiming = self.timeIndexWriter(self.mDir, False)
59 print "Compound Time :", cTiming
60 print "Multi-file Time:", mTiming
62 self.assert_(cTiming > mTiming)
64 def timeIndexWriter(self, dir, isCompound):
67 self.addDocuments(dir, isCompound)
69 return timedelta(seconds=time() - start)
71 def addDocuments(self, dir, isCompound):
73 writer = IndexWriter(dir, SimpleAnalyzer(), True,
74 IndexWriter.MaxFieldLength.LIMITED)
75 writer.setUseCompoundFile(isCompound)
77 # change to adjust performance of indexing with FSDirectory
78 # writer.mergeFactor = writer.mergeFactor
79 # writer.maxMergeDocs = writer.maxMergeDocs
80 # writer.minMergeDocs = writer.minMergeDocs
82 for word in self.docs:
84 doc.add(Field("keyword", word,
85 Field.Store.YES, Field.Index.NOT_ANALYZED))
86 doc.add(Field("unindexed", word,
87 Field.Store.YES, Field.Index.NO))
88 doc.add(Field("unstored", word,
89 Field.Store.NO, Field.Index.ANALYZED))
90 doc.add(Field("text", word,
91 Field.Store.YES, Field.Index.ANALYZED))
92 writer.addDocument(doc)
97 def loadDocuments(self, numDocs, wordsPerDoc):
99 return ["Bibamus " * wordsPerDoc] * numDocs