2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org.apache.lucene.analysis.cn.smart;
20 import java.io.StringReader;
22 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
23 import org.apache.lucene.analysis.Analyzer;
24 import org.apache.lucene.analysis.TokenStream;
25 import org.apache.lucene.util.Version;
27 public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
29 public void testChineseStopWordsDefault() throws Exception {
30 Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT); /* will load stopwords */
31 String sentence = "我购买了道具和服装。";
32 String result[] = { "我", "购买", "了", "道具", "和", "服装" };
33 assertAnalyzesTo(ca, sentence, result);
34 // set stop-words from the outer world - must yield same behavior
35 ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT, SmartChineseAnalyzer.getDefaultStopSet());
36 assertAnalyzesTo(ca, sentence, result);
40 * This test is the same as the above, except with two phrases.
41 * This tests to ensure the SentenceTokenizer->WordTokenFilter chain works correctly.
43 public void testChineseStopWordsDefaultTwoPhrases() throws Exception {
44 Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT); /* will load stopwords */
45 String sentence = "我购买了道具和服装。 我购买了道具和服装。";
46 String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" };
47 assertAnalyzesTo(ca, sentence, result);
51 * This test is the same as the above, except using an ideographic space as a separator.
52 * This tests to ensure the stopwords are working correctly.
54 public void testChineseStopWordsDefaultTwoPhrasesIdeoSpace() throws Exception {
55 Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT); /* will load stopwords */
56 String sentence = "我购买了道具和服装 我购买了道具和服装。";
57 String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" };
58 assertAnalyzesTo(ca, sentence, result);
62 * Punctuation is handled in a strange way if you disable stopwords
63 * In this example the IDEOGRAPHIC FULL STOP is converted into a comma.
64 * if you don't supply (true) to the constructor, or use a different stopwords list,
65 * then punctuation is indexed.
67 public void testChineseStopWordsOff() throws Exception {
68 Analyzer[] analyzers = new Analyzer[] {
69 new SmartChineseAnalyzer(Version.LUCENE_CURRENT, false),/* doesn't load stopwords */
70 new SmartChineseAnalyzer(Version.LUCENE_CURRENT, null) /* sets stopwords to empty set */};
71 String sentence = "我购买了道具和服装。";
72 String result[] = { "我", "购买", "了", "道具", "和", "服装", "," };
73 for (Analyzer analyzer : analyzers) {
74 assertAnalyzesTo(analyzer, sentence, result);
75 assertAnalyzesToReuse(analyzer, sentence, result);
80 * Check that position increments after stopwords are correct,
81 * when stopfilter is configured with enablePositionIncrements
83 public void testChineseStopWords2() throws Exception {
84 Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT); /* will load stopwords */
85 String sentence = "Title:San"; // : is a stopword
86 String result[] = { "titl", "san"};
87 int startOffsets[] = { 0, 6 };
88 int endOffsets[] = { 5, 9 };
89 int posIncr[] = { 1, 2 };
90 assertAnalyzesTo(ca, sentence, result, startOffsets, endOffsets, posIncr);
93 public void testChineseAnalyzer() throws Exception {
94 Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true);
95 String sentence = "我购买了道具和服装。";
96 String[] result = { "我", "购买", "了", "道具", "和", "服装" };
97 assertAnalyzesTo(ca, sentence, result);
101 * English words are lowercased and porter-stemmed.
103 public void testMixedLatinChinese() throws Exception {
104 assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买 Tests 了道具和服装",
105 new String[] { "我", "购买", "test", "了", "道具", "和", "服装"});
109 * Numerics are parsed as their own tokens
111 public void testNumerics() throws Exception {
112 assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买 Tests 了道具和服装1234",
113 new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234"});
117 * Full width alphas and numerics are folded to half-width
119 public void testFullWidth() throws Exception {
120 assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买 Tests 了道具和服装1234",
121 new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234"});
125 * Presentation form delimiters are removed
127 public void testDelimiters() throws Exception {
128 assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买︱ Tests 了道具和服装",
129 new String[] { "我", "购买", "test", "了", "道具", "和", "服装"});
133 * Text from writing systems other than Chinese and Latin are parsed as individual characters.
134 * (regardless of Unicode category)
136 public void testNonChinese() throws Exception {
137 assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买 روبرتTests 了道具和服装",
138 new String[] { "我", "购买", "ر", "و", "ب", "ر", "ت", "test", "了", "道具", "和", "服装"});
142 * Test what the analyzer does with out-of-vocabulary words.
143 * In this case the name is Yousaf Raza Gillani.
144 * Currently it is being analyzed into single characters...
146 public void testOOV() throws Exception {
147 assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "优素福·拉扎·吉拉尼",
148 new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" });
150 assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "优素福拉扎吉拉尼",
151 new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" });
154 public void testOffsets() throws Exception {
155 assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买了道具和服装",
156 new String[] { "我", "购买", "了", "道具", "和", "服装" },
157 new int[] { 0, 1, 3, 4, 6, 7 },
158 new int[] { 1, 3, 4, 6, 7, 9 });
161 public void testReusableTokenStream() throws Exception {
162 Analyzer a = new SmartChineseAnalyzer(Version.LUCENE_CURRENT);
163 assertAnalyzesToReuse(a, "我购买 Tests 了道具和服装",
164 new String[] { "我", "购买", "test", "了", "道具", "和", "服装"},
165 new int[] { 0, 1, 4, 10, 11, 13, 14 },
166 new int[] { 1, 3, 9, 11, 13, 14, 16 });
167 assertAnalyzesToReuse(a, "我购买了道具和服装。",
168 new String[] { "我", "购买", "了", "道具", "和", "服装" },
169 new int[] { 0, 1, 3, 4, 6, 7 },
170 new int[] { 1, 3, 4, 6, 7, 9 });
174 public void testLargeDocument() throws Exception {
175 StringBuilder sb = new StringBuilder();
176 for (int i = 0; i < 5000; i++) {
177 sb.append("我购买了道具和服装。");
179 Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
180 TokenStream stream = analyzer.reusableTokenStream("", new StringReader(sb.toString()));
182 while (stream.incrementToken()) {
187 public void testLargeSentence() throws Exception {
188 StringBuilder sb = new StringBuilder();
189 for (int i = 0; i < 5000; i++) {
190 sb.append("我购买了道具和服装");
192 Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
193 TokenStream stream = analyzer.reusableTokenStream("", new StringReader(sb.toString()));
195 while (stream.incrementToken()) {
199 /** blast some random strings through the analyzer */
200 public void testRandomStrings() throws Exception {
201 checkRandomData(random, new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);