1 # ====================================================================
2 # Licensed under the Apache License, Version 2.0 (the "License");
3 # you may not use this file except in compliance with the License.
4 # You may obtain a copy of the License at
6 # http://www.apache.org/licenses/LICENSE-2.0
8 # Unless required by applicable law or agreed to in writing, software
9 # distributed under the License is distributed on an "AS IS" BASIS,
10 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 # See the License for the specific language governing permissions and
12 # limitations under the License.
13 # ====================================================================
import os
from datetime import timedelta
from time import time

from lucene import \
    IndexWriter, StandardAnalyzer, Document, Field, \
    InputStreamReader, FileInputStream, Version, SimpleFSDirectory, File
class Indexer(object):
    """Command-line helper that indexes every ``.txt`` file under a directory.

    All entry points are classmethods, so the class is used without being
    instantiated (``Indexer.main(sys.argv)``).
    """

    @classmethod
    def main(cls, argv):
        """Run the indexer from a command-line style argument list.

        argv -- ``[program, index dir, data dir]``; with any other length
                the usage line is printed and nothing is indexed.
        """
        if len(argv) != 3:
            print("Usage: python Indexer.py <index dir> <data dir>")
            return

        # Argument order follows the usage line above.
        indexDir, dataDir = argv[1], argv[2]

        start = time()
        numIndexed = cls.index(indexDir, dataDir)
        duration = timedelta(seconds=time() - start)

        print("Indexing %s files took %s" % (numIndexed, duration))

    @classmethod
    def index(cls, indexDir, dataDir):
        """Index the ``.txt`` files under dataDir into an index at indexDir.

        Returns the writer's document count once the directory walk is done.
        Raises IOError when dataDir is missing or is not a directory.
        """
        # isdir() is already False for a path that does not exist.
        if not os.path.isdir(dataDir):
            raise IOError("%s does not exist or is not a directory"
                          % (dataDir,))

        directory = SimpleFSDirectory(File(indexDir))
        try:
            # NOTE(review): the positional True is presumably Lucene's
            # "create" flag (start a fresh index) -- confirm against the
            # IndexWriter constructor of the Lucene version in use.
            writer = IndexWriter(directory,
                                 StandardAnalyzer(Version.LUCENE_CURRENT),
                                 True, IndexWriter.MaxFieldLength.LIMITED)
            try:
                writer.setUseCompoundFile(False)
                cls.indexDirectory(writer, dataDir)
                # Read the count while the writer is still open.
                numIndexed = writer.numDocs()
            finally:
                # Release the writer even when indexing fails part-way.
                writer.close()
        finally:
            directory.close()

        return numIndexed

    @classmethod
    def indexDirectory(cls, writer, dir):
        """Recursively index every ``.txt`` file found below dir.

        Regular files with any other suffix are skipped; sub-directories
        are descended into.
        """
        for name in os.listdir(dir):
            path = os.path.join(dir, name)
            if os.path.isfile(path):
                if path.endswith('.txt'):
                    cls.indexFile(writer, path)
            elif os.path.isdir(path):
                cls.indexDirectory(writer, path)

    @classmethod
    def indexFile(cls, writer, path):
        """Add one document for the file at path to writer.

        The document gets a "contents" field fed from a reader over the
        file (decoded as iso-8859-1) and a stored, un-analyzed "path" field
        holding the absolute path.  A file that cannot be opened is
        reported and skipped rather than aborting the whole run.
        """
        try:
            reader = InputStreamReader(FileInputStream(path), 'iso-8859-1')
        except IOError as e:
            # NOTE(review): PyLucene may surface Java-side failures as
            # lucene.JavaError rather than IOError -- confirm this handler
            # actually fires for unreadable files.
            print('IOError while opening %s: %s' % (path, e))
            return

        print('Indexing %s' % (path,))
        try:
            doc = Document()
            doc.add(Field("contents", reader))
            doc.add(Field("path", os.path.abspath(path),
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)
        finally:
            # Do not leak the underlying file handle if adding fails.
            reader.close()
if __name__ == "__main__":
    # Imported here because they are only needed when run as a script.
    import sys
    import lucene

    # PyLucene needs the embedded Java VM started before any Lucene class
    # is used.
    lucene.initVM()
    Indexer.main(sys.argv)