1 # ====================================================================
2 # Licensed under the Apache License, Version 2.0 (the "License");
3 # you may not use this file except in compliance with the License.
4 # You may obtain a copy of the License at
6 # http://www.apache.org/licenses/LICENSE-2.0
8 # Unless required by applicable law or agreed to in writing, software
9 # distributed under the License is distributed on an "AS IS" BASIS,
10 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 # See the License for the specific language governing permissions and
12 # limitations under the License.
13 # ====================================================================
16 SimpleAnalyzer, StandardAnalyzer, StringReader, Version, \
17 TermAttribute, PositionIncrementAttribute, TypeAttribute, OffsetAttribute
class AnalyzerUtils(object):
    """Helpers for inspecting the tokens an analyzer produces.

    Every method is a classmethod, so callers use the class directly, e.g.
    ``AnalyzerUtils.displayTokens(analyzer, text)``.

    * The attribute helpers take ``source``, any object exposing
      ``addAttribute`` (the token streams used below do).
    * The display and assertion helpers take ``analyzer``, any object
      exposing ``tokenStream(fieldName, reader)``.
    """

    @classmethod
    def main(cls, argv):
        """Print full token details for two sample sentences.

        ``argv`` is accepted so the method can serve as the command-line
        entry point; it is not used.
        """
        # print(...) with a single argument behaves the same on Python 2
        # and Python 3.
        print("SimpleAnalyzer")
        cls.displayTokensWithFullDetails(SimpleAnalyzer(),
                                         "The quick brown fox....")

        print("StandardAnalyzer")
        cls.displayTokensWithFullDetails(
            StandardAnalyzer(Version.LUCENE_CURRENT),
            "I'll e-mail you at xyz@example.com")

    @classmethod
    def setPositionIncrement(cls, source, posIncr):
        """Set the position increment attribute of ``source`` to ``posIncr``."""
        attr = source.addAttribute(PositionIncrementAttribute.class_)
        attr.setPositionIncrement(posIncr)

    @classmethod
    def getPositionIncrement(cls, source):
        """Return the position increment currently held by ``source``."""
        attr = source.addAttribute(PositionIncrementAttribute.class_)
        return attr.getPositionIncrement()

    @classmethod
    def setTerm(cls, source, term):
        """Replace the term text held by ``source`` with ``term``."""
        attr = source.addAttribute(TermAttribute.class_)
        attr.setTermBuffer(term)

    @classmethod
    def getTerm(cls, source):
        """Return the term text currently held by ``source``."""
        attr = source.addAttribute(TermAttribute.class_)
        return attr.term()

    @classmethod
    def setType(cls, source, type):
        """Set the token type held by ``source`` to ``type``.

        The parameter keeps the name ``type`` (shadowing the builtin inside
        this method only) so existing keyword callers keep working.
        """
        attr = source.addAttribute(TypeAttribute.class_)
        attr.setType(type)

    @classmethod
    def getType(cls, source):
        """Return the token type currently held by ``source``."""
        attr = source.addAttribute(TypeAttribute.class_)
        return attr.type()

    @classmethod
    def displayTokens(cls, analyzer, text):
        """Write every token of ``text`` to stdout as ``[term]``.

        Tokens are separated by a single space and no trailing newline is
        written, so the caller decides how the line ends.
        """
        import sys

        stream = analyzer.tokenStream("contents", StringReader(text))
        term = stream.addAttribute(TermAttribute.class_)

        separator = ""
        while stream.incrementToken():
            sys.stdout.write("%s[%s]" % (separator, term.term()))
            separator = " "

    @classmethod
    def displayTokensWithPositions(cls, analyzer, text):
        """Write the tokens of ``text`` to stdout grouped by position.

        Each position starts a new line of the form ``N: [term] [term]``.
        """
        stream = analyzer.tokenStream("contents", StringReader(text))
        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)

        cls._writeGroupedByPosition(stream, posIncr,
                                    lambda: "[%s]" % (term.term(),))

    @classmethod
    def displayTokensWithFullDetails(cls, analyzer, text):
        """Write the tokens of ``text`` to stdout with offsets and type.

        Each position starts a new line of the form
        ``N: [term:start->end:type]``.
        """
        stream = analyzer.tokenStream("contents", StringReader(text))

        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)
        offset = stream.addAttribute(OffsetAttribute.class_)
        # Named typeAttr rather than ``type`` to avoid shadowing the builtin.
        typeAttr = stream.addAttribute(TypeAttribute.class_)

        def describe():
            # Reads the attributes at call time: they describe whichever
            # token the stream is currently positioned on.
            return "[%s:%d->%d:%s]" % (term.term(),
                                       offset.startOffset(),
                                       offset.endOffset(),
                                       typeAttr.type())

        cls._writeGroupedByPosition(stream, posIncr, describe)

    @classmethod
    def _writeGroupedByPosition(cls, stream, posIncr, formatToken):
        """Drain ``stream``, writing ``formatToken()`` for each token.

        ``posIncr`` is the stream's position increment attribute and
        ``formatToken`` is a zero-argument callable returning the text for
        the current token.  A ``"\\nN:"`` header is written whenever the
        position advances; a final newline terminates the output.
        """
        import sys

        write = sys.stdout.write
        position = 0
        while stream.incrementToken():
            increment = posIncr.getPositionIncrement()
            # An increment of 0 means the token shares the previous token's
            # position, so it stays on the current line.
            if increment > 0:
                position += increment
                write("\n%d:" % (position,))
            write(" " + formatToken())
        write("\n")

    @classmethod
    def assertAnalyzesTo(cls, analyzer, input, outputs):
        """Check that analyzing ``input`` yields exactly the terms ``outputs``.

        The parameter keeps the name ``input`` (shadowing the builtin inside
        this method only) so existing keyword callers keep working.

        Raises:
            AssertionError: if the stream ends early, a term differs from
                the expected one, or the stream yields extra tokens.
        """
        stream = analyzer.tokenStream("field", StringReader(input))
        termAttr = stream.addAttribute(TermAttribute.class_)

        for output in outputs:
            if not stream.incrementToken():
                raise AssertionError(
                    'stream.incrementToken(): stream ended before '
                    'expected term %r' % (output,))
            actual = termAttr.term()
            if output != actual:
                raise AssertionError(
                    'output == termAttr.term(): expected %r, got %r'
                    % (output, actual))

        if stream.incrementToken():
            raise AssertionError(
                'not stream.incrementToken(): unexpected extra term %r'
                % (termAttr.term(),))
if __name__ == "__main__":
    # Imported here because only the command-line entry point needs it.
    import sys

    # NOTE(review): PyLucene normally requires the JVM to be started
    # (lucene.initVM()) before any Lucene class is used; nothing visible in
    # this file does so -- confirm it happens elsewhere.
    AnalyzerUtils.main(sys.argv)