Our configuration
[pylucene.git] / test / test_Analyzers.py
1 # ====================================================================
2 #   Licensed under the Apache License, Version 2.0 (the "License");
3 #   you may not use this file except in compliance with the License.
4 #   You may obtain a copy of the License at
5 #
6 #       http://www.apache.org/licenses/LICENSE-2.0
7 #
8 #   Unless required by applicable law or agreed to in writing, software
9 #   distributed under the License is distributed on an "AS IS" BASIS,
10 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 #   See the License for the specific language governing permissions and
12 #   limitations under the License.
13 # ====================================================================
14
15 from unittest import main
16 from BaseTokenStreamTestCase import BaseTokenStreamTestCase
17 from lucene import *
18
19
class AnalyzersTestCase(BaseTokenStreamTestCase):
    """
    Analyzer unit tests ported from Java Lucene.

    Each analyzer test feeds sample strings through an analyzer and checks
    the produced terms with the inherited _assertAnalyzesTo() helper.
    """

    def testSimple(self):
        """
        SimpleAnalyzer: per the expectations below, terms come out
        lowercased and digits/punctuation act as separators.
        """
        analyzer = SimpleAnalyzer()
        cases = (
            ("foo bar FOO BAR", ["foo", "bar", "foo", "bar"]),
            ("foo      bar .  FOO <> BAR", ["foo", "bar", "foo", "bar"]),
            ("foo.bar.FOO.BAR", ["foo", "bar", "foo", "bar"]),
            ("U.S.A.", ["u", "s", "a"]),
            ("C++", ["c"]),
            ("B2B", ["b", "b"]),
            ("2B", ["b"]),
            ('"QUOTED" word', ["quoted", "word"]),
        )
        for text, terms in cases:
            self._assertAnalyzesTo(analyzer, text, terms)

    def testNull(self):
        """
        WhitespaceAnalyzer: per the expectations below, input is split on
        whitespace only; case and punctuation are left untouched.
        """
        analyzer = WhitespaceAnalyzer()
        cases = (
            ("foo bar FOO BAR", ["foo", "bar", "FOO", "BAR"]),
            ("foo      bar .  FOO <> BAR",
             ["foo", "bar", ".", "FOO", "<>", "BAR"]),
            ("foo.bar.FOO.BAR", ["foo.bar.FOO.BAR"]),
            ("U.S.A.", ["U.S.A."]),
            ("C++", ["C++"]),
            ("B2B", ["B2B"]),
            ("2B", ["2B"]),
            ('"QUOTED" word', ['"QUOTED"', "word"]),
        )
        for text, terms in cases:
            self._assertAnalyzesTo(analyzer, text, terms)

    def testStop(self):
        """
        StopAnalyzer: per the expectations below, terms are lowercased and
        the words "a", "such" and "these" are dropped.
        """
        analyzer = StopAnalyzer(Version.LUCENE_CURRENT)
        cases = (
            ("foo bar FOO BAR", ["foo", "bar", "foo", "bar"]),
            ("foo a bar such FOO THESE BAR", ["foo", "bar", "foo", "bar"]),
        )
        for text, terms in cases:
            self._assertAnalyzesTo(analyzer, text, terms)

    def _verifyPayload(self, ts):
        """
        Drain the token stream `ts`, asserting that the first payload byte
        seen on the k-th token equals k (counting from 1).
        """
        payloadAtt = ts.getAttribute(PayloadAttribute.class_)
        expected = 1
        while ts.incrementToken():
            firstByte = payloadAtt.getPayload().toByteArray()[0]
            self.assertEqual(expected, firstByte)
            expected += 1

    # Make sure old style next() calls result in a new copy of payloads
    def testPayloadCopy(self):
        """
        Run the payload check over two freshly built
        WhitespaceTokenizer -> PayloadSetter chains on the same text.
        """
        s = "how now brown cow"
        # Both passes build an identical chain and are verified the same way.
        for _ in range(2):
            stream = PayloadSetter(WhitespaceTokenizer(StringReader(s)))
            self._verifyPayload(stream)
95
96
class PayloadSetter(PythonTokenFilter):
    """
    Token filter that sets the same Payload object on every token it
    forwards, bumping the payload's backing byte after each token.
    """

    def __init__(self, input):
        """
        Wrap the token stream `input` and prepare a one-byte payload.
        """
        super(PayloadSetter, self).__init__(input)

        self.input = input
        self.payloadAtt = self.addAttribute(PayloadAttribute.class_)
        # One-element Java byte array handed to the Payload below.
        # NOTE(review): presumably the Payload references this array rather
        # than copying it, so later increments show through - confirm.
        self.data = JArray('byte')(1)
        self.p = Payload(self.data, 0, 1)

    def incrementToken(self):
        """
        Advance the wrapped stream by one token.

        Returns False when the wrapped stream is exhausted; otherwise sets
        the payload on the token, increments the backing byte and returns
        True.
        """
        if self.input.incrementToken():
            self.payloadAtt.setPayload(self.p)
            # Incremented only after the payload has been set on the token.
            self.data[0] += 1
            return True

        return False
116
117
if __name__ == "__main__":
    # Script entry point: initialise the lucene VM, then hand control to
    # unittest's main(). Passing -loop on the command line reruns the suite
    # forever.
    import sys
    import lucene

    lucene.initVM()
    if '-loop' in sys.argv:
        # Strip the flag so unittest's argument parsing never sees it.
        sys.argv.remove('-loop')
        while True:
            try:
                main()
            except (SystemExit, Exception):
                # unittest's main() finishes by raising SystemExit; swallow
                # it (and any error from the run) so the loop continues.
                # KeyboardInterrupt is deliberately not caught, so Ctrl-C
                # still stops the loop.
                pass
    else:
        main()