1 # ====================================================================
2 # Licensed under the Apache License, Version 2.0 (the "License");
3 # you may not use this file except in compliance with the License.
4 # You may obtain a copy of the License at
6 # http://www.apache.org/licenses/LICENSE-2.0
8 # Unless required by applicable law or agreed to in writing, software
9 # distributed under the License is distributed on an "AS IS" BASIS,
10 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 # See the License for the specific language governing permissions and
12 # limitations under the License.
13 # ====================================================================
15 # Author: Erik Hatcher
17 # to index all man pages on $MANPATH or /usr/share/man:
18 # python manindex.py pages
19 # ====================================================================
22 from subprocess import *
23 from lucene import IndexWriter, StandardAnalyzer, Document, Field
24 from lucene import SimpleFSDirectory, File, initVM, Version
def indexDirectory(dir):
    """Index every regular file located directly inside ``dir``.

    Entries that are not regular files (sub-directories, broken links, ...)
    are ignored; the directory is not walked recursively.

    dir -- path of the directory to scan; it is handed to ``os.listdir``.
    """
    for name in os.listdir(dir):
        if os.path.isfile(os.path.join(dir, name)):
            # NOTE(review): the statement guarded by this test was not present
            # in the reviewed text.  indexFile(dir, name) is the only handler
            # in this file that takes a directory plus a file name -- confirm.
            indexFile(dir, name)
def _runPipeline(stages, cwd):
    """Run ``stages`` (a list of argument vectors) as a pipeline.

    Each stage reads the previous stage's standard output.  No shell is
    involved, so file names never need quoting.

    Returns a ``(text, status)`` tuple: the text written by the last stage and
    that stage's exit status -- the same status a shell reports for a pipeline.
    """
    procs = []
    upstream = None
    for argv in stages:
        # Text mode keeps the output a ``str`` on which the regexes can run.
        proc = Popen(argv, stdin=upstream, stdout=PIPE, cwd=cwd,
                     universal_newlines=True)
        if upstream is not None:
            # Drop our copy of the read end so the producer gets SIGPIPE if
            # the consumer exits early instead of blocking forever.
            upstream.close()
        procs.append(proc)
        upstream = proc.stdout

    text = procs[-1].communicate()[0]
    for proc in procs[:-1]:
        proc.wait()  # reap the producers
    return text, procs[-1].returncode


def _manSection(text, titles):
    """Return the body of the first section headed by one of ``titles``.

    A heading is a line consisting solely of the title.  The body runs up to
    the next line that starts in column 0 (the next heading) or to the end of
    ``text``.  Returns '' when no such heading exists.
    """
    pattern = r'^(?:%s)$(.*?)(?:^\S|\Z)' % '|'.join(map(re.escape, titles))
    found = re.search(pattern, text, re.MULTILINE | re.DOTALL)
    return found.group(1) if found else ''


def indexFile(dir, filename):
    """Render one man page to plain text and add it to the index.

    The page is formatted with ``groff -t -e -E -mandoc -Tascii`` and cleaned
    with ``col -bx``; ``.gz`` pages are first expanded with ``gunzip -c``.
    The NAME, SYNOPSIS and DESCRIPTION/OVERVIEW sections are extracted from
    the rendered text and stored in a new document that is added through the
    module-level ``writer``.

    dir      -- directory containing the page
    filename -- name of the page inside ``dir``, e.g. ``ls.1`` or ``ls.1.gz``

    Raises RuntimeError when the last stage of the formatting pipeline exits
    with a non-zero status, and OSError when a formatting tool cannot be
    started.
    """
    path = os.path.join(dir, filename)
    print(" File:  %s" % filename)

    # "<command>.<section>[.gz]": split on the last dot of the uncompressed
    # name.  A name without any dot is indexed with an empty section instead
    # of failing on an unmatched pattern.
    compressed = filename.endswith('.gz')
    stem = filename[:-len('.gz')] if compressed else filename
    command, dot, section = stem.rpartition('.')
    if not dot:
        command, section = stem, ''

    # The tools run with the parent of ``dir`` as working directory, so the
    # page itself is passed as an absolute path: a relative ``path`` would
    # otherwise be resolved against that new working directory.
    source = os.path.abspath(path)
    groff = ['groff', '-t', '-e', '-E', '-mandoc', '-Tascii']
    if compressed:
        stages = [['gunzip', '-c', source], groff, ['col', '-bx']]
    else:
        stages = [groff + [source], ['col', '-bx']]

    data, err = _runPipeline(stages, os.path.dirname(dir))
    if err:
        raise RuntimeError('%s failed with exit code %d' % (command, err))

    name = _manSection(data, ('NAME',))
    synopsis = _manSection(data, ('SYNOPSIS', 'SYNOPSYS'))
    # The former pattern ended in a lazy group with nothing after it and so
    # always captured an empty description.
    description = _manSection(data, ('DESCRIPTION', 'OVERVIEW'))

    doc = Document()
    doc.add(Field("command", command,
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("section", section,
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("name", name.strip(),
                  Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("synopsis", synopsis.strip(),
                  Field.Store.YES, Field.Index.ANALYZED))
    # Catch-all search field; indexed but not stored.
    doc.add(Field("keywords", ' '.join((command, name, synopsis, description)),
                  Field.Store.NO, Field.Index.ANALYZED))
    doc.add(Field("filename", os.path.abspath(path),
                  Field.Store.YES, Field.Index.NOT_ANALYZED))

    writer.addDocument(doc)
81 if __name__ == '__main__':
83 if len(sys.argv) != 2:
84 print "Usage: python manindex.py <index dir>"
88 indexDir = sys.argv[1]
89 writer = IndexWriter(SimpleFSDirectory(File(indexDir)),
90 StandardAnalyzer(Version.LUCENE_CURRENT), True,
91 IndexWriter.MaxFieldLength.LIMITED)
92 manpath = os.environ.get('MANPATH', '/usr/share/man').split(os.pathsep)
95 for name in os.listdir(dir):
96 path = os.path.join(dir, name)
97 if os.path.isdir(path):