lucene-java-3.5.0/lucene/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 package org.apache.lucene.analysis.cn.smart;
  19
  20 import java.io.StringReader;
  21
  22 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
  23 import org.apache.lucene.analysis.Analyzer;
  24 import org.apache.lucene.analysis.TokenStream;
  25 import org.apache.lucene.util.Version;
  26
  27 public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
  28
  29   public void testChineseStopWordsDefault() throws Exception {
  30     Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT); /* will load stopwords */
  31     String sentence = "我购买了道具和服装。";
  32     String result[] = { "我", "购买", "了", "道具", "和", "服装" };
  33     assertAnalyzesTo(ca, sentence, result);
  34     // set stop-words from the outer world - must yield same behavior
  35     ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT, SmartChineseAnalyzer.getDefaultStopSet());
  36     assertAnalyzesTo(ca, sentence, result);
  37   }
  38
  39   /*
  40    * This test is the same as the above, except with two phrases.
  41    * This tests to ensure the SentenceTokenizer->WordTokenFilter chain works correctly.
  42    */
  43   public void testChineseStopWordsDefaultTwoPhrases() throws Exception {
  44     Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT); /* will load stopwords */
  45     String sentence = "我购买了道具和服装。 我购买了道具和服装。";
  46     String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" };
  47     assertAnalyzesTo(ca, sentence, result);
  48   }
  49
  50   /*
  51    * This test is the same as the above, except using an ideographic space as a separator.
  52    * This tests to ensure the stopwords are working correctly.
  53    */
  54   public void testChineseStopWordsDefaultTwoPhrasesIdeoSpace() throws Exception {
  55     Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT); /* will load stopwords */
  56     String sentence = "我购买了道具和服装　我购买了道具和服装。";
  57     String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" };
  58     assertAnalyzesTo(ca, sentence, result);
  59   }
  60
  61   /*
  62    * Punctuation is handled in a strange way if you disable stopwords
  63    * In this example the IDEOGRAPHIC FULL STOP is converted into a comma.
  64    * if you don't supply (true) to the constructor, or use a different stopwords list,
  65    * then punctuation is indexed.
  66    */
  67   public void testChineseStopWordsOff() throws Exception {
  68     Analyzer[] analyzers = new Analyzer[] {
  69       new SmartChineseAnalyzer(Version.LUCENE_CURRENT, false),/* doesn't load stopwords */
  70       new SmartChineseAnalyzer(Version.LUCENE_CURRENT, null) /* sets stopwords to empty set */};
  71     String sentence = "我购买了道具和服装。";
  72     String result[] = { "我", "购买", "了", "道具", "和", "服装", "," };
  73     for (Analyzer analyzer : analyzers) {
  74       assertAnalyzesTo(analyzer, sentence, result);
  75       assertAnalyzesToReuse(analyzer, sentence, result);
  76     }
  77   }
  78
  79   /*
  80    * Check that position increments after stopwords are correct,
  81    * when stopfilter is configured with enablePositionIncrements
  82    */
  83   public void testChineseStopWords2() throws Exception {
  84     Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT); /* will load stopwords */
  85     String sentence = "Title:San"; // : is a stopword
  86     String result[] = { "titl", "san"};
  87     int startOffsets[] = { 0, 6 };
  88     int endOffsets[] = { 5, 9 };
  89     int posIncr[] = { 1, 2 };
  90     assertAnalyzesTo(ca, sentence, result, startOffsets, endOffsets, posIncr);
  91   }
  92
  93   public void testChineseAnalyzer() throws Exception {
  94     Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true);
  95     String sentence = "我购买了道具和服装。";
  96     String[] result = { "我", "购买", "了", "道具", "和", "服装" };
  97     assertAnalyzesTo(ca, sentence, result);
  98   }
  99
 100   /*
 101    * English words are lowercased and porter-stemmed.
 102    */
 103   public void testMixedLatinChinese() throws Exception {
 104     assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买 Tests 了道具和服装",
 105         new String[] { "我", "购买", "test", "了", "道具", "和", "服装"});
 106   }
 107
 108   /*
 109    * Numerics are parsed as their own tokens
 110    */
 111   public void testNumerics() throws Exception {
 112     assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买 Tests 了道具和服装1234",
 113       new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234"});
 114   }
 115
 116   /*
 117    * Full width alphas and numerics are folded to half-width
 118    */
 119   public void testFullWidth() throws Exception {
 120     assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买 Ｔｅｓｔｓ 了道具和服装１２３４",
 121         new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234"});
 122   }
 123
 124   /*
 125    * Presentation form delimiters are removed
 126    */
 127   public void testDelimiters() throws Exception {
 128     assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买︱ Tests 了道具和服装",
 129         new String[] { "我", "购买", "test", "了", "道具", "和", "服装"});
 130   }
 131
 132   /*
 133    * Text from writing systems other than Chinese and Latin are parsed as individual characters.
 134    * (regardless of Unicode category)
 135    */
 136   public void testNonChinese() throws Exception {
 137     assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买 روبرتTests 了道具和服装",
 138         new String[] { "我", "购买", "ر", "و", "ب", "ر", "ت", "test", "了", "道具", "和", "服装"});
 139   }
 140
 141   /*
 142    * Test what the analyzer does with out-of-vocabulary words.
 143    * In this case the name is Yousaf Raza Gillani.
 144    * Currently it is being analyzed into single characters...
 145    */
 146   public void testOOV() throws Exception {
 147     assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "优素福·拉扎·吉拉尼",
 148       new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" });
 149
 150     assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "优素福拉扎吉拉尼",
 151       new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" });
 152   }
 153
 154   public void testOffsets() throws Exception {
 155     assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买了道具和服装",
 156         new String[] { "我", "购买", "了", "道具", "和", "服装" },
 157         new int[] { 0, 1, 3, 4, 6, 7 },
 158         new int[] { 1, 3, 4, 6, 7, 9 });
 159   }
 160
 161   public void testReusableTokenStream() throws Exception {
 162     Analyzer a = new SmartChineseAnalyzer(Version.LUCENE_CURRENT);
 163     assertAnalyzesToReuse(a, "我购买 Tests 了道具和服装",
 164         new String[] { "我", "购买", "test", "了", "道具", "和", "服装"},
 165         new int[] { 0, 1, 4, 10, 11, 13, 14 },
 166         new int[] { 1, 3, 9, 11, 13, 14, 16 });
 167     assertAnalyzesToReuse(a, "我购买了道具和服装。",
 168         new String[] { "我", "购买", "了", "道具", "和", "服装" },
 169         new int[] { 0, 1, 3, 4, 6, 7 },
 170         new int[] { 1, 3, 4, 6, 7, 9 });
 171   }
 172
 173   // LUCENE-3026
 174   public void testLargeDocument() throws Exception {
 175     StringBuilder sb = new StringBuilder();
 176     for (int i = 0; i < 5000; i++) {
 177       sb.append("我购买了道具和服装。");
 178     }
 179     Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
 180     TokenStream stream = analyzer.reusableTokenStream("", new StringReader(sb.toString()));
 181     stream.reset();
 182     while (stream.incrementToken()) {
 183     }
 184   }
 185
 186   // LUCENE-3026
 187   public void testLargeSentence() throws Exception {
 188     StringBuilder sb = new StringBuilder();
 189     for (int i = 0; i < 5000; i++) {
 190       sb.append("我购买了道具和服装");
 191     }
 192     Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
 193     TokenStream stream = analyzer.reusableTokenStream("", new StringReader(sb.toString()));
 194     stream.reset();
 195     while (stream.incrementToken()) {
 196     }
 197   }
 198
 199   /** blast some random strings through the analyzer */
 200   public void testRandomStrings() throws Exception {
 201     checkRandomData(random, new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
 202   }
 203 }