old python needs __main__ to call a module
[pylucene.git] / samples / LuceneInAction / lia / meetlucene / Indexer.py
1 # ====================================================================
2 #   Licensed under the Apache License, Version 2.0 (the "License");
3 #   you may not use this file except in compliance with the License.
4 #   You may obtain a copy of the License at
5 #
6 #       http://www.apache.org/licenses/LICENSE-2.0
7 #
8 #   Unless required by applicable law or agreed to in writing, software
9 #   distributed under the License is distributed on an "AS IS" BASIS,
10 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 #   See the License for the specific language governing permissions and
12 #   limitations under the License.
13 # ====================================================================
14
15 import os
16
17 from time import time
18 from datetime import timedelta
19 from lucene import \
20     IndexWriter, StandardAnalyzer, Document, Field, \
21     InputStreamReader, FileInputStream, Version, SimpleFSDirectory, File
22
23
class Indexer(object):
    """Command-line sample that indexes ``.txt`` files into a Lucene index.

    Every method is a classmethod, so the class is used as a namespace and is
    never instantiated.  The syntax is kept compatible with both Python 2.6+
    and Python 3 (single-argument ``print(...)``, ``except ... as``).
    """

    @classmethod
    def main(cls, argv):
        """Run the indexer from a ``sys.argv``-style argument list.

        Args:
            argv: ``[program name, index dir, data dir]``.  Any other length
                prints the usage line and returns without indexing.
        """
        if len(argv) != 3:
            print("Usage: python Indexer.py <index dir> <data dir>")
            return

        indexDir, dataDir = argv[1], argv[2]

        start = time()
        numIndexed = cls.index(indexDir, dataDir)
        duration = timedelta(seconds=time() - start)

        print("Indexing %s files took %s" % (numIndexed, duration))

    @classmethod
    def index(cls, indexDir, dataDir):
        """Index every ``.txt`` file found under ``dataDir`` into ``indexDir``.

        Args:
            indexDir: path of the directory that holds the Lucene index.
            dataDir: path of an existing directory to walk recursively.

        Returns:
            The value of ``writer.numDocs()`` read after all files were
            added and before ``optimize()`` is called.

        Raises:
            IOError: if ``dataDir`` does not exist or is not a directory.
        """
        # isdir() is already False for a missing path, so one check covers
        # both "does not exist" and "is not a directory".
        if not os.path.isdir(dataDir):
            raise IOError("%s does not exist or is not a directory"
                          % (dataDir,))

        # Named ``directory`` rather than ``dir`` to avoid shadowing the
        # builtin.
        directory = SimpleFSDirectory(File(indexDir))
        try:
            # The third argument (True) is the writer's ``create`` flag in
            # the Lucene 3.x constructor.
            writer = IndexWriter(directory,
                                 StandardAnalyzer(Version.LUCENE_CURRENT),
                                 True, IndexWriter.MaxFieldLength.LIMITED)
            try:
                writer.setUseCompoundFile(False)

                cls.indexDirectory(writer, dataDir)

                numIndexed = writer.numDocs()
                writer.optimize()
            finally:
                # Always close the writer, even when indexing fails, so it
                # is not left open on an error path.
                writer.close()
        finally:
            directory.close()

        return numIndexed

    @classmethod
    def indexDirectory(cls, writer, dir):
        """Recursively hand every ``.txt`` file under ``dir`` to indexFile.

        Args:
            writer: the open index writer, passed through to ``indexFile``.
            dir: directory to walk.  The name shadows the builtin but is kept
                because it is part of the method's signature.
        """
        for name in os.listdir(dir):
            path = os.path.join(dir, name)
            if os.path.isfile(path):
                if path.endswith('.txt'):
                    cls.indexFile(writer, path)
            elif os.path.isdir(path):
                cls.indexDirectory(writer, path)

    @classmethod
    def indexFile(cls, writer, path):
        """Add one file to the index as a two-field document.

        The document gets a ``contents`` field fed from a reader over the
        file (decoded as iso-8859-1) and a ``path`` field holding the file's
        absolute path.  A file that cannot be opened is reported on stdout
        and skipped.

        Args:
            writer: the open index writer the document is added to.
            path: path of the file to index.
        """
        # PyLucene reports Java exceptions as lucene.JavaError, so catching
        # IOError alone would not intercept a failed FileInputStream open.
        # Imported here to keep this fix local to the method.
        from lucene import JavaError

        try:
            reader = InputStreamReader(FileInputStream(path), 'iso-8859-1')
        except (IOError, JavaError) as e:
            print('IOError while opening %s: %s' % (path, e))
            return

        print('Indexing %s' % (path,))
        try:
            doc = Document()
            doc.add(Field("contents", reader))
            doc.add(Field("path", os.path.abspath(path),
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)
        finally:
            # Close the stream even if building or adding the document fails.
            reader.close()
89
90
if __name__ == "__main__":
    # Script entry point: pass the raw command line straight to Indexer.main,
    # which validates the argument count itself.
    from sys import argv as _argv
    Indexer.main(_argv)