--- /dev/null
+package org.apache.lucene.analysis.th;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.util.Version;
+
+/**
+ * Test case for ThaiAnalyzer, modified from TestFrenchAnalyzer
+ *
+ * @version 0.1
+ */
+
+public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
+
+  /**
+   * Skips the calling test unless {@link ThaiWordFilter#DBBI_AVAILABLE} is true.
+   * Every test in this class builds a {@link ThaiAnalyzer}, so they all share
+   * this single guard instead of repeating the assumption inline.
+   */
+  private void assumeThaiBreakIteratorAvailable() {
+    assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
+  }
+
+  /**
+   * Checks the tokens produced for an unbroken run of Thai text together with
+   * the two parallel int arrays expected for them (offsets).
+   */
+  public void testOffsets() throws Exception {
+    assumeThaiBreakIteratorAvailable();
+    assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี",
+        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
+        new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
+        new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
+  }
+
+  /**
+   * Checks the token types reported by the current-version analyzer: Thai words
+   * are expected as {@code <SOUTHEAST_ASIAN>} and the trailing Thai digits as
+   * {@code <NUM>}.
+   */
+  public void testTokenType() throws Exception {
+    assumeThaiBreakIteratorAvailable();
+    assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
+        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
+        new String[] { "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
+                       "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
+                       "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
+                       "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
+                       "<NUM>" });
+  }
+
+  /**
+   * Thai numeric tokens were typed as {@code <ALPHANUM>} instead of {@code <NUM>}.
+   * With {@code Version.LUCENE_30} every token, including the digits, is
+   * expected to carry the {@code <ALPHANUM>} type.
+   *
+   * @deprecated testing backwards behavior
+   */
+  @Deprecated
+  public void testBuggyTokenType30() throws Exception {
+    assumeThaiBreakIteratorAvailable();
+    assertAnalyzesTo(new ThaiAnalyzer(Version.LUCENE_30), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
+        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
+        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
+                       "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
+                       "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
+  }
+
+  /**
+   * Exercises the {@code Version.LUCENE_30} analyzer on empty input, pure Thai
+   * text, mixed Thai/Latin text and a sentence containing English stop words.
+   *
+   * @deprecated testing backwards behavior
+   */
+  @Deprecated
+  public void testAnalyzer30() throws Exception {
+    assumeThaiBreakIteratorAvailable();
+    ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
+
+    assertAnalyzesTo(analyzer, "", new String[] {});
+
+    assertAnalyzesTo(
+        analyzer,
+        "การที่ได้ต้องแสดงว่างานดี",
+        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
+
+    // In this version "XY&Z" and the e-mail address are each expected as one lowercased token.
+    assertAnalyzesTo(
+        analyzer,
+        "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
+        new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
+
+    // English stop words
+    assertAnalyzesTo(
+        analyzer,
+        "ประโยคว่า The quick brown fox jumped over the lazy dogs",
+        new String[] { "ประโยค", "ว่า", "quick", "brown", "fox", "jumped", "over", "lazy", "dogs" });
+  }
+
+  /**
+   * Test that position increments are adjusted correctly for stopwords: the
+   * token following the dropped "the" is expected with an increment of 2.
+   */
+  public void testPositionIncrements() throws Exception {
+    assumeThaiBreakIteratorAvailable();
+    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
+
+    assertAnalyzesTo(analyzer, "การที่ได้ต้อง the แสดงว่างานดี",
+        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
+        new int[] { 0, 3, 6, 9, 18, 22, 25, 28 },
+        new int[] { 3, 6, 9, 13, 22, 25, 28, 30 },
+        new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
+
+    // case that a stopword is adjacent to thai text, with no whitespace
+    assertAnalyzesTo(analyzer, "การที่ได้ต้องthe แสดงว่างานดี",
+        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
+        new int[] { 0, 3, 6, 9, 17, 21, 24, 27 },
+        new int[] { 3, 6, 9, 13, 21, 24, 27, 29 },
+        new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
+  }
+
+  /**
+   * Runs several inputs through one current-version analyzer instance via
+   * {@code assertAnalyzesToReuse}. Here "XY&Z" and the e-mail address are
+   * expected to be split into separate tokens.
+   */
+  public void testReusableTokenStream() throws Exception {
+    assumeThaiBreakIteratorAvailable();
+    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
+    assertAnalyzesToReuse(analyzer, "", new String[] {});
+
+    assertAnalyzesToReuse(
+        analyzer,
+        "การที่ได้ต้องแสดงว่างานดี",
+        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
+
+    assertAnalyzesToReuse(
+        analyzer,
+        "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
+        new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
+  }
+
+  /**
+   * Same as {@link #testReusableTokenStream()} but with {@code Version.LUCENE_30},
+   * where "XY&Z" and the e-mail address are expected as single tokens.
+   *
+   * @deprecated for version back compat
+   */
+  @Deprecated
+  public void testReusableTokenStream30() throws Exception {
+    assumeThaiBreakIteratorAvailable();
+    ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
+    assertAnalyzesToReuse(analyzer, "", new String[] {});
+
+    assertAnalyzesToReuse(
+        analyzer,
+        "การที่ได้ต้องแสดงว่างานดี",
+        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
+
+    assertAnalyzesToReuse(
+        analyzer,
+        "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
+        new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    // Guarded like every other test here, since it also constructs a ThaiAnalyzer.
+    assumeThaiBreakIteratorAvailable();
+    checkRandomData(random, new ThaiAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
+
+  /**
+   * LUCENE-3044: a reused token stream must still produce the same tokens after
+   * a consumer adds an attribute the analyzer itself does not use.
+   */
+  public void testAttributeReuse() throws Exception {
+    assumeThaiBreakIteratorAvailable();
+    // NOTE(review): LUCENE_30 looks deliberate here (the issue presumably concerns that code path) - confirm.
+    ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
+    // just consume
+    TokenStream ts = analyzer.reusableTokenStream("dummy", new StringReader("ภาษาไทย"));
+    assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
+    // this consumer adds flagsAtt, which this analyzer does not use.
+    ts = analyzer.reusableTokenStream("dummy", new StringReader("ภาษาไทย"));
+    ts.addAttribute(FlagsAttribute.class);
+    assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
+  }
+}