1 # ====================================================================
2 # Licensed under the Apache License, Version 2.0 (the "License");
3 # you may not use this file except in compliance with the License.
4 # You may obtain a copy of the License at
6 # http://www.apache.org/licenses/LICENSE-2.0
8 # Unless required by applicable law or agreed to in writing, software
9 # distributed under the License is distributed on an "AS IS" BASIS,
10 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 # See the License for the specific language governing permissions and
12 # limitations under the License.
13 # ====================================================================
16 import lia.handlingtypes as handlingtypes
19 from datetime import timedelta
20 from lucene import IndexWriter, StandardAnalyzer
22 from lia.util.ClassLoader import ClassLoader
25 # A File Indexer capable of recursively indexing a directory tree.
26 # Based on lia.meetlucene.Indexer, but handling more than plaintext.
# FileIndexer groups four functions that are rebound as classmethods at the
# bottom of the class: main, index, indexDirectory and indexFile. Together the
# visible lines read a handler.properties mapping, open a Lucene IndexWriter,
# walk a data directory and hand each file to a handler class selected by the
# file's extension.
# NOTE(review): this excerpt omits lines (the number fused onto the start of
# each line skips), so the `def main` header, loop headers, try/except headers
# and return statements are not visible here. The comments below describe only
# what the visible lines establish; anything else is marked as an assumption.
29 class FileIndexer(object):
# Presumably the body of main (its `def` line is not visible - confirm).
# Usage message; the condition that triggers it is not visible in this excerpt.
34 print "Usage: python FileIndexer.py <index dir> <data dir>"
# handler.properties is looked up in the 'framework' directory beside the
# lia.handlingtypes package file.
40 propsFile = os.path.join(os.path.dirname(handlingtypes.__file__),
41 'framework', 'handler.properties')
# NOTE(review): `input` shadows the builtin of the same name, and no close()
# is visible in this excerpt - confirm the handle is released.
42 input = file(propsFile)
# Read one line and strip surrounding whitespace (the enclosing loop and its
# exit condition are not visible here).
45 line = input.readline().strip()
# Lines starting with '#' get special handling - presumably skipped as
# comments; the statement under this `if` is not visible.
48 if line.startswith('#'):
# Parse "name=value" and store both sides stripped.
# NOTE(review): split('=') has no maxsplit, so the two-name unpacking raises
# ValueError unless the line contains exactly one '='.
50 name, value = line.split('=')
51 props[name.strip()] = value.strip()
# Publish the parsed mapping on the class; indexFile reads it back as
# cls.handlerProps.
53 cls.handlerProps = props
# Build the index and report how many files were indexed and how long it took.
# `start`, `indexDir` and `dataDir` are assigned on lines not visible here.
56 numIndexed = cls.index(indexDir, dataDir)
57 duration = timedelta(seconds=time() - start)
59 print "Indexing %s files took %s" %(numIndexed, duration)
# index(cls, indexDir, dataDir): open an IndexWriter on indexDir and index the
# tree rooted at dataDir through indexDirectory.
# Raises IOError when dataDir does not exist or is not a directory.
# NOTE(review): the writer's close and this function's return statement are
# not visible in this excerpt; main uses the return value as the file count.
61 def index(cls, indexDir, dataDir):
63 if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
64 raise IOError, "%s does not exist or is not a directory" %(dataDir)
# NOTE(review): the third argument (True) is presumably the "create" flag,
# i.e. an existing index at indexDir would be replaced - confirm against the
# IndexWriter constructor documentation.
66 writer = IndexWriter(indexDir, StandardAnalyzer(), True,
67 IndexWriter.MaxFieldLength.UNLIMITED)
68 writer.setUseCompoundFile(False)
70 numIndexed = cls.indexDirectory(writer, dataDir)
# indexDirectory(cls, writer, dir): pass every regular file in `dir` to
# indexFile and recurse into sub-directories, accumulating a count.
# NOTE(review): the parameter name `dir` shadows the builtin; it is part of
# the signature, so renaming it would affect keyword callers.
76 def indexDirectory(cls, writer, dir):
81 for name in os.listdir(dir):
82 path = os.path.join(dir, name)
83 if os.path.isfile(path):
# The result is kept in `doc`; how it feeds into `count` is not visible here.
84 doc = cls.indexFile(writer, path)
# Sub-directories whose name starts with '.' are ignored.
87 elif os.path.isdir(path) and not name.startswith('.'):
# NOTE(review): as excerpted this recurses on `dir`, not `path`. Lines are
# elided just before this one, so `dir` may have been rebound (for example by
# a loop over collected sub-directories). Confirm it is not still the
# directory being listed, which would recurse without end.
91 count += cls.indexDirectory(writer, dir)
# indexFile(cls, writer, path): choose a handler class from the file's
# extension and delegate the indexing of `path` to an instance of it.
# Requires cls.handlerProps to be set first (main assigns it); otherwise the
# lookup below raises AttributeError.
95 def indexFile(cls, writer, path):
# Drop the leading extension separator so `ext` matches the keys of
# handlerProps (e.g. "txt" rather than ".txt").
97 name, ext = os.path.splitext(path)
98 if ext.startswith(os.path.extsep):
99 ext = ext[len(os.path.extsep):]
# No entry for this extension: report it (what is returned afterwards is not
# visible in this excerpt).
102 handlerClassName = cls.handlerProps.get(ext, None)
103 if handlerClassName is None:
104 print "error indexing %s: no handler for %s files" %(path, ext)
# Resolve the handler class by name, instantiate it with no arguments and let
# it index the file.
108 handlerClass = ClassLoader.loadClass(handlerClassName)
109 handler = handlerClass()
111 doc = handler.indexFile(writer, path)
113 print 'indexed', path
# `e` is presumably bound by an except clause that is not visible here.
119 print 'error indexing %s: %s' %(path, e)
# Pre-decorator idiom: rebind each function above as a classmethod so it can
# be called on the class itself, e.g. FileIndexer.index(indexDir, dataDir).
122 main = classmethod(main)
123 index = classmethod(index)
124 indexDirectory = classmethod(indexDirectory)
125 indexFile = classmethod(indexFile)