pylucene 3.5.0-3
[pylucene.git] / samples / LuceneInAction / lia / indexing / CompoundVersusMultiFileIndexTest.py
1 # ====================================================================
2 #   Licensed under the Apache License, Version 2.0 (the "License");
3 #   you may not use this file except in compliance with the License.
4 #   You may obtain a copy of the License at
5 #
6 #       http://www.apache.org/licenses/LICENSE-2.0
7 #
8 #   Unless required by applicable law or agreed to in writing, software
9 #   distributed under the License is distributed on an "AS IS" BASIS,
10 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 #   See the License for the specific language governing permissions and
12 #   limitations under the License.
13 # ====================================================================
14
15 import os
16
17 from unittest import TestCase
18 from time import time
19 from datetime import timedelta
20
21 from lucene import \
22      IndexWriter, SimpleAnalyzer, SimpleFSDirectory, Document, Field, \
23      System, File
24
25
26 class CompoundVersusMultiFileIndexTest(TestCase):
27
28     def __init__(self, *args):
29
30         super(CompoundVersusMultiFileIndexTest, self).__init__(*args)
31         self.docs = self.loadDocuments(5000, 10)
32
33     def setUp(self):
34
35         indexDir = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
36                                 "index-dir")
37
38         cIndexDir = "%s-compound" %(indexDir)
39         mIndexDir = "%s-multi" %(indexDir)
40         self.rmdir(cIndexDir)
41         self.rmdir(mIndexDir)
42
43         self.cDir = SimpleFSDirectory(File(cIndexDir))
44         self.mDir = SimpleFSDirectory(File(mIndexDir))
45
46     def rmdir(self, dir):
47
48         for dir, dirnames, filenames in os.walk(dir):
49             for filename in filenames:
50                 os.remove(os.path.join(dir, filename))
51             for dirname in dirnames:
52                 os.rmdir(os.path.join(dir, dirname))
53
54     def testTiming(self):
55
56         cTiming = self.timeIndexWriter(self.cDir, True)
57         mTiming = self.timeIndexWriter(self.mDir, False)
58
59         print "Compound Time :", cTiming
60         print "Multi-file Time:", mTiming
61
62         self.assert_(cTiming > mTiming)
63
64     def timeIndexWriter(self, dir, isCompound):
65
66         start = time()
67         self.addDocuments(dir, isCompound)
68
69         return timedelta(seconds=time() - start)
70
71     def addDocuments(self, dir, isCompound):
72
73         writer = IndexWriter(dir, SimpleAnalyzer(), True,
74                              IndexWriter.MaxFieldLength.LIMITED)
75         writer.setUseCompoundFile(isCompound)
76
77         # change to adjust performance of indexing with FSDirectory
78         # writer.mergeFactor = writer.mergeFactor
79         # writer.maxMergeDocs = writer.maxMergeDocs
80         # writer.minMergeDocs = writer.minMergeDocs
81
82         for word in self.docs:
83             doc = Document()
84             doc.add(Field("keyword", word,
85                           Field.Store.YES, Field.Index.NOT_ANALYZED))
86             doc.add(Field("unindexed", word,
87                           Field.Store.YES, Field.Index.NO))
88             doc.add(Field("unstored", word,
89                           Field.Store.NO, Field.Index.ANALYZED))
90             doc.add(Field("text", word,
91                           Field.Store.YES, Field.Index.ANALYZED))
92             writer.addDocument(doc)
93
94         writer.optimize()
95         writer.close()
96
97     def loadDocuments(self, numDocs, wordsPerDoc):
98
99         return ["Bibamus " * wordsPerDoc] * numDocs