1 package org.apache.lucene.analysis.cjk;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import java.io.IOException;
22 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
23 import org.apache.lucene.analysis.Analyzer;
25 public class TestCJKTokenizer extends BaseTokenStreamTestCase {
34 public TestToken newToken(String termText, int start, int end, int type) {
35 TestToken token = new TestToken();
36 token.termText = termText;
37 token.type = CJKTokenizer.TOKEN_TYPE_NAMES[type];
43 public void checkCJKToken(final String str, final TestToken[] out_tokens) throws IOException {
44 Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
45 String terms[] = new String[out_tokens.length];
46 int startOffsets[] = new int[out_tokens.length];
47 int endOffsets[] = new int[out_tokens.length];
48 String types[] = new String[out_tokens.length];
49 for (int i = 0; i < out_tokens.length; i++) {
50 terms[i] = out_tokens[i].termText;
51 startOffsets[i] = out_tokens[i].start;
52 endOffsets[i] = out_tokens[i].end;
53 types[i] = out_tokens[i].type;
55 assertAnalyzesTo(analyzer, str, terms, startOffsets, endOffsets, types, null);
58 public void checkCJKTokenReusable(final Analyzer a, final String str, final TestToken[] out_tokens) throws IOException {
59 Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
60 String terms[] = new String[out_tokens.length];
61 int startOffsets[] = new int[out_tokens.length];
62 int endOffsets[] = new int[out_tokens.length];
63 String types[] = new String[out_tokens.length];
64 for (int i = 0; i < out_tokens.length; i++) {
65 terms[i] = out_tokens[i].termText;
66 startOffsets[i] = out_tokens[i].start;
67 endOffsets[i] = out_tokens[i].end;
68 types[i] = out_tokens[i].type;
70 assertAnalyzesToReuse(analyzer, str, terms, startOffsets, endOffsets, types, null);
73 public void testJa1() throws IOException {
74 String str = "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341";
76 TestToken[] out_tokens = {
77 newToken("\u4e00\u4e8c", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
78 newToken("\u4e8c\u4e09", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
79 newToken("\u4e09\u56db", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
80 newToken("\u56db\u4e94", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
81 newToken("\u4e94\u516d", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
82 newToken("\u516d\u4e03", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
83 newToken("\u4e03\u516b", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
84 newToken("\u516b\u4e5d", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
85 newToken("\u4e5d\u5341", 8,10, CJKTokenizer.DOUBLE_TOKEN_TYPE)
87 checkCJKToken(str, out_tokens);
90 public void testJa2() throws IOException {
91 String str = "\u4e00 \u4e8c\u4e09\u56db \u4e94\u516d\u4e03\u516b\u4e5d \u5341";
93 TestToken[] out_tokens = {
94 newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
95 newToken("\u4e8c\u4e09", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
96 newToken("\u4e09\u56db", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
97 newToken("\u4e94\u516d", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
98 newToken("\u516d\u4e03", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
99 newToken("\u4e03\u516b", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
100 newToken("\u516b\u4e5d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
101 newToken("\u5341", 12,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
103 checkCJKToken(str, out_tokens);
106 public void testC() throws IOException {
107 String str = "abc defgh ijklmn opqrstu vwxy z";
109 TestToken[] out_tokens = {
110 newToken("abc", 0, 3, CJKTokenizer.SINGLE_TOKEN_TYPE),
111 newToken("defgh", 4, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
112 newToken("ijklmn", 10, 16, CJKTokenizer.SINGLE_TOKEN_TYPE),
113 newToken("opqrstu", 17, 24, CJKTokenizer.SINGLE_TOKEN_TYPE),
114 newToken("vwxy", 25, 29, CJKTokenizer.SINGLE_TOKEN_TYPE),
115 newToken("z", 30, 31, CJKTokenizer.SINGLE_TOKEN_TYPE),
117 checkCJKToken(str, out_tokens);
120 public void testMix() throws IOException {
121 String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
123 TestToken[] out_tokens = {
124 newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
125 newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
126 newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
127 newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
128 newToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE),
129 newToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
130 newToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
131 newToken("\u304f\u3051", 10,12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
132 newToken("\u3051\u3053", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
134 checkCJKToken(str, out_tokens);
137 public void testMix2() throws IOException {
138 String str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
140 TestToken[] out_tokens = {
141 newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
142 newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
143 newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
144 newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
145 newToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE),
146 newToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
147 newToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
148 newToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
149 newToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
150 newToken("\u304f\u3051", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
151 newToken("\u3053", 14,15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
153 checkCJKToken(str, out_tokens);
156 public void testSingleChar() throws IOException {
157 String str = "\u4e00";
159 TestToken[] out_tokens = {
160 newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
162 checkCJKToken(str, out_tokens);
166 * Full-width text is normalized to half-width
168 public void testFullWidth() throws Exception {
169 String str = "Test 1234";
170 TestToken[] out_tokens = {
171 newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE),
172 newToken("1234", 5, 9, CJKTokenizer.SINGLE_TOKEN_TYPE)
174 checkCJKToken(str, out_tokens);
178 * Non-english text (not just CJK) is treated the same as CJK: C1C2 C2C3
180 public void testNonIdeographic() throws Exception {
181 String str = "\u4e00 روبرت موير";
182 TestToken[] out_tokens = {
183 newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
184 newToken("رو", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
185 newToken("وب", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
186 newToken("بر", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
187 newToken("رت", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
188 newToken("مو", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
189 newToken("وي", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
190 newToken("ير", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE)
192 checkCJKToken(str, out_tokens);
196 * Non-english text with nonletters (non-spacing marks,etc) is treated as C1C2 C2C3,
197 * except for words are split around non-letters.
199 public void testNonIdeographicNonLetter() throws Exception {
200 String str = "\u4e00 رُوبرت موير";
201 TestToken[] out_tokens = {
202 newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
203 newToken("ر", 2, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
204 newToken("وب", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
205 newToken("بر", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
206 newToken("رت", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
207 newToken("مو", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
208 newToken("وي", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
209 newToken("ير", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
211 checkCJKToken(str, out_tokens);
214 public void testTokenStream() throws Exception {
215 Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
216 assertAnalyzesTo(analyzer, "\u4e00\u4e01\u4e02",
217 new String[] { "\u4e00\u4e01", "\u4e01\u4e02"});
220 public void testReusableTokenStream() throws Exception {
221 Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
222 String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
224 TestToken[] out_tokens = {
225 newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
226 newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
227 newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
228 newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
229 newToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE),
230 newToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
231 newToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
232 newToken("\u304f\u3051", 10,12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
233 newToken("\u3051\u3053", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
235 checkCJKTokenReusable(analyzer, str, out_tokens);
237 str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
238 TestToken[] out_tokens2 = {
239 newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
240 newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
241 newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
242 newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
243 newToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE),
244 newToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
245 newToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
246 newToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
247 newToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
248 newToken("\u304f\u3051", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
249 newToken("\u3053", 14,15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
251 checkCJKTokenReusable(analyzer, str, out_tokens2);
255 * LUCENE-2207: wrong offset calculated by end()
257 public void testFinalOffset() throws IOException {
258 checkCJKToken("あい", new TestToken[] {
259 newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
260 checkCJKToken("あい ", new TestToken[] {
261 newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
262 checkCJKToken("test", new TestToken[] {
263 newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE) });
264 checkCJKToken("test ", new TestToken[] {
265 newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE) });
266 checkCJKToken("あいtest", new TestToken[] {
267 newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
268 newToken("test", 2, 6, CJKTokenizer.SINGLE_TOKEN_TYPE) });
269 checkCJKToken("testあい ", new TestToken[] {
270 newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE),
271 newToken("あい", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
274 /** blast some random strings through the analyzer */
275 public void testRandomStrings() throws Exception {
276 checkRandomData(random, new CJKAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);