1 package org.apache.lucene.analysis.th;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.StringReader;
22 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
23 import org.apache.lucene.analysis.TokenStream;
24 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
25 import org.apache.lucene.util.Version;
28 * Test case for ThaiAnalyzer, modified from TestFrenchAnalyzer
33 public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
36 * testcase for offsets
38 public void testOffsets() throws Exception {
39 assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
40 assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี",
41 new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
42 new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
43 new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
/**
 * Checks the token types expected from a ThaiAnalyzer built with TEST_VERSION_CURRENT
 * for Thai text followed by a run of Thai digits.
 */
46 public void testTokenType() throws Exception {
// Assumption: requires ThaiWordFilter.DBBI_AVAILABLE to be true.
47 assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
// Arguments after the input: the expected terms, then the expected type of each term.
48 assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
49 new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
50 new String[] { "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
51 "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
52 "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
53 "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
// NOTE(review): nine terms are expected above but only eight types are visible here, and
// the end of this array and of the call is not shown in this view. The type expected for
// the trailing digit token must be confirmed against the original file.
58 * Thai numeric tokens were typed as <ALPHANUM> instead of <NUM>.
59 * @deprecated testing backwards behavior
62 public void testBuggyTokenType30() throws Exception {
63 assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
64 assertAnalyzesTo(new ThaiAnalyzer(Version.LUCENE_30), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
65 new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
66 new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
67 "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
68 "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
71 /** @deprecated testing backwards behavior */
73 public void testAnalyzer30() throws Exception {
// Exercises a ThaiAnalyzer built with Version.LUCENE_30 on empty, Thai-only,
// mixed Thai/Latin, and Thai-plus-English input.
// Assumption: requires ThaiWordFilter.DBBI_AVAILABLE to be true.
74 assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
75 ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
// Empty input: an empty array of terms is expected.
77 assertAnalyzesTo(analyzer, "", new String[] {});
// NOTE(review): the call openings for the three input/expected-terms pairs below are not
// visible in this view; presumably each is assertAnalyzesTo(analyzer, ...) as on the line
// above. Confirm against the original file.
// Thai-only input.
81 "การที่ได้ต้องแสดงว่างานดี",
82 new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
// Mixed input: "XY&Z" and the e-mail address are each expected as a single lower-cased term.
86 "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
87 new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
// English text after Thai: "The" and "the" do not appear among the expected terms.
92 "ประโยคว่า The quick brown fox jumped over the lazy dogs",
93 new String[] { "ประโยค", "ว่า", "quick", "brown", "fox", "jumped", "over", "lazy", "dogs" });
97 * Test that position increments are adjusted correctly for stopwords.
99 public void testPositionIncrements() throws Exception {
100 assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
101 ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
103 assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้อง the แสดงว่างานดี",
104 new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
105 new int[] { 0, 3, 6, 9, 18, 22, 25, 28 },
106 new int[] { 3, 6, 9, 13, 22, 25, 28, 30 },
107 new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
109 // case that a stopword is adjacent to thai text, with no whitespace
110 assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องthe แสดงว่างานดี",
111 new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
112 new int[] { 0, 3, 6, 9, 17, 21, 24, 27 },
113 new int[] { 3, 6, 9, 13, 21, 24, 27, 29 },
114 new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
/**
 * Runs several inputs through a single ThaiAnalyzer built with TEST_VERSION_CURRENT,
 * using assertAnalyzesToReuse.
 */
117 public void testReusableTokenStream() throws Exception {
// Assumption: requires ThaiWordFilter.DBBI_AVAILABLE to be true.
118 assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
119 ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
// Empty input: an empty array of terms is expected.
120 assertAnalyzesToReuse(analyzer, "", new String[] {});
// NOTE(review): unlike the call above, the two calls below show no analyzer argument
// after the opening parenthesis in this view; presumably `analyzer` is passed first.
// Confirm against the original file.
// Thai-only input.
122 assertAnalyzesToReuse(
124 "การที่ได้ต้องแสดงว่างานดี",
125 new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
// Mixed input: "XY&Z" is expected as "xy", "z" and the e-mail address as "xyz", "demo.com".
127 assertAnalyzesToReuse(
129 "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
130 new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
133 /** @deprecated for version back compat */
135 public void testReusableTokenStream30() throws Exception {
// Runs several inputs through a single ThaiAnalyzer built with Version.LUCENE_30,
// using assertAnalyzesToReuse.
// Assumption: requires ThaiWordFilter.DBBI_AVAILABLE to be true.
136 assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
137 ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
// Empty input: an empty array of terms is expected.
138 assertAnalyzesToReuse(analyzer, "", new String[] {});
// NOTE(review): unlike the call above, the two calls below show no analyzer argument
// after the opening parenthesis in this view; presumably `analyzer` is passed first.
// Confirm against the original file.
// Thai-only input.
140 assertAnalyzesToReuse(
142 "การที่ได้ต้องแสดงว่างานดี",
143 new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
// Mixed input: "XY&Z" and the e-mail address are each expected as a single lower-cased term.
145 assertAnalyzesToReuse(
147 "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
148 new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
151 /** blast some random strings through the analyzer */
152 public void testRandomStrings() throws Exception {
153 checkRandomData(random, new ThaiAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
157 public void testAttributeReuse() throws Exception {
158 assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
159 ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
161 TokenStream ts = analyzer.reusableTokenStream("dummy", new StringReader("ภาษาไทย"));
162 assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
163 // this consumer adds flagsAtt, which this analyzer does not use.
164 ts = analyzer.reusableTokenStream("dummy", new StringReader("ภาษาไทย"));
165 ts.addAttribute(FlagsAttribute.class);
166 assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });