# ====================================================================
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
# ====================================================================

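"""
Helpers from the "Lucene in Action" samples for inspecting analyzers:
dump the tokens a TokenStream produces (terms, positions, offsets, types)
and get or set individual token attributes.
"""
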
from lucene import \
     SimpleAnalyzer, StandardAnalyzer, StringReader, Version, \
     TermAttribute, PositionIncrementAttribute, TypeAttribute, OffsetAttribute


class AnalyzerUtils(object):
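    """
    Static helpers for dumping analyzer output and for reading or writing
    token attributes on a TokenStream.
    """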

    def main(cls, argv):

        print "SimpleAnalyzer"
        cls.displayTokensWithFullDetails(SimpleAnalyzer(),
                                         "The quick brown fox....")

        print "\n----"
        print "StandardAnalyzer"
        cls.displayTokensWithFullDetails(StandardAnalyzer(Version.LUCENE_CURRENT),
                                         "I'll e-mail you at xyz@example.com")

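    # Attribute helpers: addAttribute() returns the attribute instance already
    # present on the stream (registering one if needed), so these read or
    # write the current token's attributes in place.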
    def setPositionIncrement(cls, source, posIncr):
        attr = source.addAttribute(PositionIncrementAttribute.class_)
        attr.setPositionIncrement(posIncr)

    def getPositionIncrement(cls, source):
        attr = source.addAttribute(PositionIncrementAttribute.class_)
        return attr.getPositionIncrement()

    def setTerm(cls, source, term):
        attr = source.addAttribute(TermAttribute.class_)
        attr.setTermBuffer(term)

    def getTerm(cls, source):
        attr = source.addAttribute(TermAttribute.class_)
        return attr.term()

    def setType(cls, source, type):
        attr = source.addAttribute(TypeAttribute.class_)
        attr.setType(type)

    def getType(cls, source):
        attr = source.addAttribute(TypeAttribute.class_)
        return attr.type()

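    # Print each token's term text in brackets, all on one line.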
    def displayTokens(cls, analyzer, text):

        tokenStream = analyzer.tokenStream("contents", StringReader(text))
        term = tokenStream.addAttribute(TermAttribute.class_)

        while tokenStream.incrementToken():
            print "[%s]" %(term.term()),

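    # Print the tokens grouped by position: tokens with a position increment
    # of zero share the position of the previous token.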
    def displayTokensWithPositions(cls, analyzer, text):

        stream = analyzer.tokenStream("contents", StringReader(text))
        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)

        position = 0
        while stream.incrementToken():
            increment = posIncr.getPositionIncrement()
            if increment > 0:
                position = position + increment
                print "\n%d:" %(position),

            print "[%s]" %(term.term()),
        print

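    # Print each token as [term:startOffset->endOffset:type], grouped by
    # position as above.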
    def displayTokensWithFullDetails(cls, analyzer, text):

        stream = analyzer.tokenStream("contents", StringReader(text))

        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)
        offset = stream.addAttribute(OffsetAttribute.class_)
        type = stream.addAttribute(TypeAttribute.class_)

        position = 0
        while stream.incrementToken():
            increment = posIncr.getPositionIncrement()
            if increment > 0:
                position = position + increment
                print "\n%d:" %(position),

            print "[%s:%d->%d:%s]" %(term.term(),
                                     offset.startOffset(),
                                     offset.endOffset(),
                                     type.type()),
        print

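    # Assert that the analyzer produces exactly the expected sequence of
    # terms for the given input.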
    def assertAnalyzesTo(cls, analyzer, input, outputs):

        stream = analyzer.tokenStream("field", StringReader(input))
        termAttr = stream.addAttribute(TermAttribute.class_)
        for output in outputs:
            if not stream.incrementToken():
                raise AssertionError, 'stream.incrementToken()'
            if output != termAttr.term():
                raise AssertionError, 'output == termAttr.term()'

        if stream.incrementToken():
            raise AssertionError, 'not stream.incrementToken()'

        stream.close()

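    # Expose all helpers as classmethods so they can be called directly on
    # AnalyzerUtils without creating an instance.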
    main = classmethod(main)
    setPositionIncrement = classmethod(setPositionIncrement)
    getPositionIncrement = classmethod(getPositionIncrement)
    setTerm = classmethod(setTerm)
    getTerm = classmethod(getTerm)
    setType = classmethod(setType)
    getType = classmethod(getType)
    displayTokens = classmethod(displayTokens)
    displayTokensWithPositions = classmethod(displayTokensWithPositions)
    displayTokensWithFullDetails = classmethod(displayTokensWithFullDetails)
    assertAnalyzesTo = classmethod(assertAnalyzesTo)


if __name__ == "__main__":
    import sys
    AnalyzerUtils.main(sys.argv)