lucene-java-3.4.0/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java

   1 package org.apache.lucene.analysis.th;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.StringReader;
  21
  22 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
  23 import org.apache.lucene.analysis.TokenStream;
  24 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
  25 import org.apache.lucene.util.Version;
  26
  27 /**
  28  * Test case for ThaiAnalyzer, modified from TestFrenchAnalyzer
  29  *
  30  * @version   0.1
  31  */
  32
  33 public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
  34
  35         /*
  36          * testcase for offsets
  37          */
  38         public void testOffsets() throws Exception {
  39           assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
  40                 assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี",
  41                     new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
  42                                 new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
  43                                 new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
  44         }
  45
  46         public void testTokenType() throws Exception {
  47             assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
  48       assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
  49                        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
  50                        new String[] { "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
  51                                       "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
  52                                       "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
  53                                       "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
  54                                       "<NUM>" });
  55         }
  56
  57         /**
  58          * Thai numeric tokens were typed as <ALPHANUM> instead of <NUM>.
  59          * @deprecated testing backwards behavior
  60          */
  61         @Deprecated
  62         public void testBuggyTokenType30() throws Exception {
  63           assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
  64                 assertAnalyzesTo(new ThaiAnalyzer(Version.LUCENE_30), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
  65                          new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
  66                          new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
  67                                         "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
  68                                         "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
  69         }
  70
  71         /** @deprecated testing backwards behavior */
  72         @Deprecated
  73     public void testAnalyzer30() throws Exception {
  74           assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
  75         ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
  76
  77                 assertAnalyzesTo(analyzer, "", new String[] {});
  78
  79                 assertAnalyzesTo(
  80                         analyzer,
  81                         "การที่ได้ต้องแสดงว่างานดี",
  82                         new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
  83
  84                 assertAnalyzesTo(
  85                         analyzer,
  86                         "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
  87                         new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
  88
  89     // English stop words
  90                 assertAnalyzesTo(
  91                         analyzer,
  92                         "ประโยคว่า The quick brown fox jumped over the lazy dogs",
  93                         new String[] { "ประโยค", "ว่า", "quick", "brown", "fox", "jumped", "over", "lazy", "dogs" });
  94         }
  95
  96         /*
  97          * Test that position increments are adjusted correctly for stopwords.
  98          */
  99         public void testPositionIncrements() throws Exception {
 100           assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
 101           ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
 102
 103     assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้อง the แสดงว่างานดี",
 104         new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
 105         new int[] { 0, 3, 6, 9, 18, 22, 25, 28 },
 106         new int[] { 3, 6, 9, 13, 22, 25, 28, 30 },
 107         new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
 108
 109           // case that a stopword is adjacent to thai text, with no whitespace
 110     assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องthe แสดงว่างานดี",
 111         new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
 112         new int[] { 0, 3, 6, 9, 17, 21, 24, 27 },
 113         new int[] { 3, 6, 9, 13, 21, 24, 27, 29 },
 114         new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
 115         }
 116
 117         public void testReusableTokenStream() throws Exception {
 118           assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
 119           ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
 120           assertAnalyzesToReuse(analyzer, "", new String[] {});
 121
 122       assertAnalyzesToReuse(
 123           analyzer,
 124           "การที่ได้ต้องแสดงว่างานดี",
 125           new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
 126
 127       assertAnalyzesToReuse(
 128           analyzer,
 129           "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
 130           new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
 131         }
 132
 133         /** @deprecated, for version back compat */
 134         @Deprecated
 135         public void testReusableTokenStream30() throws Exception {
 136             assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
 137             ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
 138             assertAnalyzesToReuse(analyzer, "", new String[] {});
 139
 140             assertAnalyzesToReuse(
 141             analyzer,
 142             "การที่ได้ต้องแสดงว่างานดี",
 143             new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
 144
 145             assertAnalyzesToReuse(
 146             analyzer,
 147             "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
 148             new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
 149   }
 150
 151   /** blast some random strings through the analyzer */
 152   public void testRandomStrings() throws Exception {
 153     checkRandomData(random, new ThaiAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
 154   }
 155
 156   // LUCENE-3044
 157   public void testAttributeReuse() throws Exception {
 158     assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
 159     ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
 160     // just consume
 161     TokenStream ts = analyzer.reusableTokenStream("dummy", new StringReader("ภาษาไทย"));
 162     assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
 163     // this consumer adds flagsAtt, which this analyzer does not use.
 164     ts = analyzer.reusableTokenStream("dummy", new StringReader("ภาษาไทย"));
 165     ts.addAttribute(FlagsAttribute.class);
 166     assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
 167   }
 168 }