1 package org.apache.lucene.analysis.fa;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
21 import org.apache.lucene.analysis.Analyzer;
/**
 * Test the Persian Analyzer
 */
27 public class TestPersianAnalyzer extends BaseTokenStreamTestCase {
30 * This test fails with NPE when the stopwords file is missing in classpath
32 public void testResourcesAvailable() {
33 new PersianAnalyzer(TEST_VERSION_CURRENT);
37 * This test shows how the combination of tokenization (breaking on zero-width
38 * non-joiner), normalization (such as treating arabic YEH and farsi YEH the
39 * same), and stopwords creates a light-stemming effect for verbs.
41 * These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar
43 public void testBehaviorVerbs() throws Exception {
44 Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
45 // active present indicative
46 assertAnalyzesTo(a, "میخورد", new String[] { "خورد" });
47 // active preterite indicative
48 assertAnalyzesTo(a, "خورد", new String[] { "خورد" });
49 // active imperfective preterite indicative
50 assertAnalyzesTo(a, "میخورد", new String[] { "خورد" });
51 // active future indicative
52 assertAnalyzesTo(a, "خواهد خورد", new String[] { "خورد" });
53 // active present progressive indicative
54 assertAnalyzesTo(a, "دارد میخورد", new String[] { "خورد" });
55 // active preterite progressive indicative
56 assertAnalyzesTo(a, "داشت میخورد", new String[] { "خورد" });
58 // active perfect indicative
59 assertAnalyzesTo(a, "خوردهاست", new String[] { "خورده" });
60 // active imperfective perfect indicative
61 assertAnalyzesTo(a, "میخوردهاست", new String[] { "خورده" });
62 // active pluperfect indicative
63 assertAnalyzesTo(a, "خورده بود", new String[] { "خورده" });
64 // active imperfective pluperfect indicative
65 assertAnalyzesTo(a, "میخورده بود", new String[] { "خورده" });
66 // active preterite subjunctive
67 assertAnalyzesTo(a, "خورده باشد", new String[] { "خورده" });
68 // active imperfective preterite subjunctive
69 assertAnalyzesTo(a, "میخورده باشد", new String[] { "خورده" });
70 // active pluperfect subjunctive
71 assertAnalyzesTo(a, "خورده بوده باشد", new String[] { "خورده" });
72 // active imperfective pluperfect subjunctive
73 assertAnalyzesTo(a, "میخورده بوده باشد", new String[] { "خورده" });
74 // passive present indicative
75 assertAnalyzesTo(a, "خورده میشود", new String[] { "خورده" });
76 // passive preterite indicative
77 assertAnalyzesTo(a, "خورده شد", new String[] { "خورده" });
78 // passive imperfective preterite indicative
79 assertAnalyzesTo(a, "خورده میشد", new String[] { "خورده" });
80 // passive perfect indicative
81 assertAnalyzesTo(a, "خورده شدهاست", new String[] { "خورده" });
82 // passive imperfective perfect indicative
83 assertAnalyzesTo(a, "خورده میشدهاست", new String[] { "خورده" });
84 // passive pluperfect indicative
85 assertAnalyzesTo(a, "خورده شده بود", new String[] { "خورده" });
86 // passive imperfective pluperfect indicative
87 assertAnalyzesTo(a, "خورده میشده بود", new String[] { "خورده" });
88 // passive future indicative
89 assertAnalyzesTo(a, "خورده خواهد شد", new String[] { "خورده" });
90 // passive present progressive indicative
91 assertAnalyzesTo(a, "دارد خورده میشود", new String[] { "خورده" });
92 // passive preterite progressive indicative
93 assertAnalyzesTo(a, "داشت خورده میشد", new String[] { "خورده" });
94 // passive present subjunctive
95 assertAnalyzesTo(a, "خورده شود", new String[] { "خورده" });
96 // passive preterite subjunctive
97 assertAnalyzesTo(a, "خورده شده باشد", new String[] { "خورده" });
98 // passive imperfective preterite subjunctive
99 assertAnalyzesTo(a, "خورده میشده باشد", new String[] { "خورده" });
100 // passive pluperfect subjunctive
101 assertAnalyzesTo(a, "خورده شده بوده باشد", new String[] { "خورده" });
102 // passive imperfective pluperfect subjunctive
103 assertAnalyzesTo(a, "خورده میشده بوده باشد", new String[] { "خورده" });
105 // active present subjunctive
106 assertAnalyzesTo(a, "بخورد", new String[] { "بخورد" });
110 * This test shows how the combination of tokenization and stopwords creates a
111 * light-stemming effect for verbs.
113 * In this case, these forms are presented with alternative orthography, using
114 * arabic yeh and whitespace. This yeh phenomenon is common for legacy text
115 * due to some previous bugs in Microsoft Windows.
117 * These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar
119 public void testBehaviorVerbsDefective() throws Exception {
120 Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
121 // active present indicative
122 assertAnalyzesTo(a, "مي خورد", new String[] { "خورد" });
123 // active preterite indicative
124 assertAnalyzesTo(a, "خورد", new String[] { "خورد" });
125 // active imperfective preterite indicative
126 assertAnalyzesTo(a, "مي خورد", new String[] { "خورد" });
127 // active future indicative
128 assertAnalyzesTo(a, "خواهد خورد", new String[] { "خورد" });
129 // active present progressive indicative
130 assertAnalyzesTo(a, "دارد مي خورد", new String[] { "خورد" });
131 // active preterite progressive indicative
132 assertAnalyzesTo(a, "داشت مي خورد", new String[] { "خورد" });
134 // active perfect indicative
135 assertAnalyzesTo(a, "خورده است", new String[] { "خورده" });
136 // active imperfective perfect indicative
137 assertAnalyzesTo(a, "مي خورده است", new String[] { "خورده" });
138 // active pluperfect indicative
139 assertAnalyzesTo(a, "خورده بود", new String[] { "خورده" });
140 // active imperfective pluperfect indicative
141 assertAnalyzesTo(a, "مي خورده بود", new String[] { "خورده" });
142 // active preterite subjunctive
143 assertAnalyzesTo(a, "خورده باشد", new String[] { "خورده" });
144 // active imperfective preterite subjunctive
145 assertAnalyzesTo(a, "مي خورده باشد", new String[] { "خورده" });
146 // active pluperfect subjunctive
147 assertAnalyzesTo(a, "خورده بوده باشد", new String[] { "خورده" });
148 // active imperfective pluperfect subjunctive
149 assertAnalyzesTo(a, "مي خورده بوده باشد", new String[] { "خورده" });
150 // passive present indicative
151 assertAnalyzesTo(a, "خورده مي شود", new String[] { "خورده" });
152 // passive preterite indicative
153 assertAnalyzesTo(a, "خورده شد", new String[] { "خورده" });
154 // passive imperfective preterite indicative
155 assertAnalyzesTo(a, "خورده مي شد", new String[] { "خورده" });
156 // passive perfect indicative
157 assertAnalyzesTo(a, "خورده شده است", new String[] { "خورده" });
158 // passive imperfective perfect indicative
159 assertAnalyzesTo(a, "خورده مي شده است", new String[] { "خورده" });
160 // passive pluperfect indicative
161 assertAnalyzesTo(a, "خورده شده بود", new String[] { "خورده" });
162 // passive imperfective pluperfect indicative
163 assertAnalyzesTo(a, "خورده مي شده بود", new String[] { "خورده" });
164 // passive future indicative
165 assertAnalyzesTo(a, "خورده خواهد شد", new String[] { "خورده" });
166 // passive present progressive indicative
167 assertAnalyzesTo(a, "دارد خورده مي شود", new String[] { "خورده" });
168 // passive preterite progressive indicative
169 assertAnalyzesTo(a, "داشت خورده مي شد", new String[] { "خورده" });
170 // passive present subjunctive
171 assertAnalyzesTo(a, "خورده شود", new String[] { "خورده" });
172 // passive preterite subjunctive
173 assertAnalyzesTo(a, "خورده شده باشد", new String[] { "خورده" });
174 // passive imperfective preterite subjunctive
175 assertAnalyzesTo(a, "خورده مي شده باشد", new String[] { "خورده" });
176 // passive pluperfect subjunctive
177 assertAnalyzesTo(a, "خورده شده بوده باشد", new String[] { "خورده" });
178 // passive imperfective pluperfect subjunctive
179 assertAnalyzesTo(a, "خورده مي شده بوده باشد", new String[] { "خورده" });
181 // active present subjunctive
182 assertAnalyzesTo(a, "بخورد", new String[] { "بخورد" });
186 * This test shows how the combination of tokenization (breaking on zero-width
187 * non-joiner or space) and stopwords creates a light-stemming effect for
188 * nouns, removing the plural -ha.
190 public void testBehaviorNouns() throws Exception {
191 Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
192 assertAnalyzesTo(a, "برگ ها", new String[] { "برگ" });
193 assertAnalyzesTo(a, "برگها", new String[] { "برگ" });
197 * Test showing that non-persian text is treated very much like SimpleAnalyzer
200 public void testBehaviorNonPersian() throws Exception {
201 Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
202 assertAnalyzesTo(a, "English test.", new String[] { "english", "test" });
206 * Basic test ensuring that reusableTokenStream works correctly.
208 public void testReusableTokenStream() throws Exception {
209 Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
210 assertAnalyzesToReuse(a, "خورده مي شده بوده باشد", new String[] { "خورده" });
211 assertAnalyzesToReuse(a, "برگها", new String[] { "برگ" });
215 * Test that custom stopwords work, and are not case-sensitive.
217 public void testCustomStopwords() throws Exception {
218 PersianAnalyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT, new String[] { "the", "and", "a" });
219 assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
223 /** blast some random strings through the analyzer */
224 public void testRandomStrings() throws Exception {
225 checkRandomData(random, new PersianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);