1 package org.apache.lucene.analysis.fr;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
22 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
23 import org.apache.lucene.analysis.CharArraySet;
24 import org.apache.lucene.util.Version;
27 * Test case for FrenchAnalyzer.
32 public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
34 public void testAnalyzer() throws Exception {
// Checks the terms produced by a FrenchAnalyzer built with TEST_VERSION_CURRENT:
// each assertAnalyzesTo call pairs an input string with the expected term array.
// NOTE(review): the embedded line numbers skip, so some call openers and
// input strings of the assertions below are not visible in this listing.
35 FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
37 assertAnalyzesTo(fa, "", new String[] {
43 new String[] { "chien", "chat", "cheval" });
48 new String[] { "chien", "chat", "cheval" });
52 " chien ,? + = - CHAT /: > CHEVAL",
53 new String[] { "chien", "chat", "cheval" });
// trailing '+' characters are expected to be dropped from the term
55 assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
59 "mot \"entreguillemet\"",
60 new String[] { "mot", "entreguillemet" });
62 // let's do some French-specific tests now
65 I would expect this to stay one term as in French the minus
66 sign is often used for composing words */
70 new String[] { "jean", "françois" });
75 "le la chien les aux chat du des à cheval",
76 new String[] { "chien", "chat", "cheval" });
78 // some nouns and adjectives
81 "lances chismes habitable chiste éléments captifs",
93 "finissions souffrirent rugissante",
94 new String[] { "fin", "souffr", "rug" });
96 // some everything else
97 // aujourd'hui stays one term which is OK
100 "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
106 "anticonstitutionnel",
109 // some more everything else
110 // with the current version both 1940-1945 and 1940:1945 are split into two terms
113 "33Bis 1940-1945 1940:1945 (---i+++)*",
114 new String[] { "33bis", "1940", "1945", "1940", "1945", "i" });
119 * @deprecated remove this test for Lucene 4.0
122 public void testAnalyzer30() throws Exception {
// Same inputs as the current-version analyzer test, but run against a
// FrenchAnalyzer built with Version.LUCENE_30 to pin its older tokenization.
// NOTE(review): the embedded line numbers skip, so some call openers and
// input strings of the assertions below are not visible in this listing.
123 FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_30);
125 assertAnalyzesTo(fa, "", new String[] {
131 new String[] { "chien", "chat", "cheval" });
136 new String[] { "chien", "chat", "cheval" });
140 " chien ,? + = - CHAT /: > CHEVAL",
141 new String[] { "chien", "chat", "cheval" });
// trailing '+' characters are expected to be dropped from the term
143 assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
147 "mot \"entreguillemet\"",
148 new String[] { "mot", "entreguillemet" });
150 // let's do some French-specific tests now
152 /* 1. couldn't resist
153 I would expect this to stay one term as in French the minus
154 sign is often used for composing words */
158 new String[] { "jean", "françois" });
163 "le la chien les aux chat du des à cheval",
164 new String[] { "chien", "chat", "cheval" });
166 // some nouns and adjectives
169 "lances chismes habitable chiste éléments captifs",
181 "finissions souffrirent rugissante",
182 new String[] { "fin", "souffr", "rug" });
184 // some everything else
185 // aujourd'hui stays one term which is OK
188 "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
194 "anticonstitutionnel",
197 // some more everything else
198 // with LUCENE_30, 1940-1945 stays as one term but 1940:1945 is split in two
201 "33Bis 1940-1945 1940:1945 (---i+++)*",
202 new String[] { "33bis", "1940-1945", "1940", "1945", "i" });
206 public void testReusableTokenStream() throws Exception {
// Runs inputs through assertAnalyzesToReuse rather than assertAnalyzesTo,
// using a single FrenchAnalyzer instance for both calls.
// NOTE(review): the expected terms of the second call are not visible in this listing.
207 FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
// articles and prepositions in the input are expected to be dropped
209 assertAnalyzesToReuse(
211 "le la chien les aux chat du des à cheval",
212 new String[] { "chien", "chat", "cheval" });
214 // some nouns and adjectives
215 assertAnalyzesToReuse(
217 "lances chismes habitable chiste éléments captifs",
228 * Test that changes to the exclusion table are applied immediately
229 * when using reusable token streams.
231 public void testExclusionTableReuse() throws Exception {
232 FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
// before any exclusion is set, "habitable" is expected to be stemmed to "habit"
233 assertAnalyzesToReuse(fa, "habitable", new String[] { "habit" });
// after excluding "habitable" from stemming, the same analyzer instance
// must emit the word unchanged on the next reuse
234 fa.setStemExclusionTable(new String[] { "habitable" });
235 assertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" });
238 public void testExclusionTableViaCtor() throws Exception {
// Passes a one-entry set containing "habitable" as the third constructor
// argument, with CharArraySet.EMPTY_SET as the second.
// NOTE(review): presumably these are the stop-word set and the stem-exclusion
// set respectively; confirm against the FrenchAnalyzer constructor.
239 CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
240 set.add("habitable");
241 FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT,
242 CharArraySet.EMPTY_SET, set);
// checked once through the reuse path ...
243 assertAnalyzesToReuse(fa, "habitable chiste", new String[] { "habitable",
// ... and once through the non-reuse path with a freshly built analyzer
246 fa = new FrenchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
247 assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
251 public void testElision() throws Exception {
252 FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
// the elided article "l'" is expected to be removed, leaving the stemmed noun
253 assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouill" });
257 * Prior to 3.1, this analyzer had no lowercase filter, so
258 * stopwords were case sensitive. Preserve this for back compat.
259 * @deprecated Remove this test in Lucene 4.0
262 public void testBuggyStopwordsCasing() throws IOException {
263 FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_30);
// with LUCENE_30 the capitalized "Votre" is not dropped; a single term "votr" is expected
264 assertAnalyzesTo(a, "Votre", new String[] { "votr" });
268 * Test that stopwords are not case sensitive
270 public void testStopwordsCasing() throws IOException {
271 FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_31);
// with LUCENE_31 the capitalized "Votre" is expected to produce no terms at all
272 assertAnalyzesTo(a, "Votre", new String[] { });
275 /** blast some random strings through the analyzer */
276 public void testRandomStrings() throws Exception {
// hands checkRandomData a fresh current-version analyzer; the last argument
// scales with RANDOM_MULTIPLIER (presumably the iteration count; confirm
// against the checkRandomData signature)
277 checkRandomData(random, new FrenchAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);