old python needs __main__ to call a module
[pylucene.git] / samples / LuceneInAction / lia / indexing / FSversusRAMDirectoryTest.py
1 # ====================================================================
2 #   Licensed under the Apache License, Version 2.0 (the "License");
3 #   you may not use this file except in compliance with the License.
4 #   You may obtain a copy of the License at
5 #
6 #       http://www.apache.org/licenses/LICENSE-2.0
7 #
8 #   Unless required by applicable law or agreed to in writing, software
9 #   distributed under the License is distributed on an "AS IS" BASIS,
10 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 #   See the License for the specific language governing permissions and
12 #   limitations under the License.
13 # ====================================================================
14
15 import os
16
17 from unittest import TestCase
18 from time import time
19 from datetime import timedelta
20
21 from lucene import \
22      IndexWriter, SimpleAnalyzer, Document, Field, System, File, \
23      SimpleFSDirectory, RAMDirectory
24
25
26 class FSversusRAMDirectoryTest(TestCase):
27
28     def __init__(self, *args):
29
30         super(FSversusRAMDirectoryTest, self).__init__(*args)
31         self.docs = self.loadDocuments(3000, 5)
32
33     def setUp(self):
34
35         fsIndexDir = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
36                                   "fs-index")
37         self.rmdir(fsIndexDir)
38         self.ramDir = RAMDirectory()
39         self.fsDir = SimpleFSDirectory(File(fsIndexDir))
40
41     def rmdir(self, dir):
42
43         for dir, dirnames, filenames in os.walk(dir):
44             for filename in filenames:
45                 os.remove(os.path.join(dir, filename))
46             for dirname in dirnames:
47                 os.rmdir(os.path.join(dir, dirname))
48
49     def testTiming(self):
50
51         ramTiming = self.timeIndexWriter(self.ramDir)
52         fsTiming = self.timeIndexWriter(self.fsDir)
53
54         #self.assert_(fsTiming > ramTiming)
55
56         print "RAMDirectory Time:", ramTiming
57         print "FSDirectory Time :", fsTiming
58
59     def timeIndexWriter(self, dir):
60
61         start = time()
62         self.addDocuments(dir)
63
64         return timedelta(seconds=time() - start)
65
66     def addDocuments(self, dir):
67
68         writer = IndexWriter(dir, SimpleAnalyzer(), True,
69                              IndexWriter.MaxFieldLength.UNLIMITED)
70
71         #
72         # change to adjust performance of indexing with FSDirectory
73         # writer.mergeFactor = writer.mergeFactor
74         # writer.maxMergeDocs = writer.maxMergeDocs
75         # writer.minMergeDocs = writer.minMergeDocs
76         #
77
78         for word in self.docs:
79             doc = Document()
80             doc.add(Field("keyword", word,
81                           Field.Store.YES, Field.Index.NOT_ANALYZED))
82             doc.add(Field("unindexed", word,
83                           Field.Store.YES, Field.Index.NO))
84             doc.add(Field("unstored", word,
85                           Field.Store.NO, Field.Index.ANALYZED))
86             doc.add(Field("text", word,
87                           Field.Store.YES, Field.Index.ANALYZED))
88             writer.addDocument(doc)
89
90         writer.optimize()
91         writer.close()
92
93     def loadDocuments(self, numDocs, wordsPerDoc):
94
95         return ["Bibamus " * wordsPerDoc] * numDocs