--- /dev/null
+package org.apache.lucene.analysis.fr;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.util.Version;
+
+/**
+ * Test case for {@link FrenchAnalyzer}.
+ * <p>
+ * Covers tokenization, lowercasing, stopword removal, stemming, elision
+ * handling and stem-exclusion sets, both for the current version and for the
+ * deprecated {@code Version.LUCENE_30} back-compat behavior.
+ */
+public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
+
+  /**
+   * Runs the assertions whose expected output is identical for every analyzer
+   * version exercised by this class. Version-specific expectations are
+   * asserted by the individual test methods.
+   *
+   * @param fa the analyzer under test
+   * @throws IOException if analysis fails
+   */
+  private void checkCommonAnalysis(FrenchAnalyzer fa) throws IOException {
+    // empty input yields no tokens
+    assertAnalyzesTo(fa, "", new String[] {
+    });
+
+    assertAnalyzesTo(
+        fa,
+        "chien chat cheval",
+        new String[] { "chien", "chat", "cheval" });
+
+    // upper-case input is lowercased
+    assertAnalyzesTo(
+        fa,
+        "chien CHAT CHEVAL",
+        new String[] { "chien", "chat", "cheval" });
+
+    // punctuation and operators between words are dropped
+    assertAnalyzesTo(
+        fa,
+        " chien ,? + = - CHAT /: > CHEVAL",
+        new String[] { "chien", "chat", "cheval" });
+
+    assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
+
+    assertAnalyzesTo(
+        fa,
+        "mot \"entreguillemet\"",
+        new String[] { "mot", "entreguillemet" });
+
+    // let's do some French-specific tests now
+
+    /* 1. couldn't resist:
+       one might expect this to stay one term, since in French the hyphen
+       is often used for composing words, but it is split in two */
+    assertAnalyzesTo(
+        fa,
+        "Jean-François",
+        new String[] { "jean", "françois" });
+
+    // 2. stopwords
+    assertAnalyzesTo(
+        fa,
+        "le la chien les aux chat du des à cheval",
+        new String[] { "chien", "chat", "cheval" });
+
+    // some nouns and adjectives
+    assertAnalyzesTo(
+        fa,
+        "lances chismes habitable chiste éléments captifs",
+        new String[] {
+            "lanc",
+            "chism",
+            "habit",
+            "chist",
+            "élément",
+            "captif" });
+
+    // some verbs
+    assertAnalyzesTo(
+        fa,
+        "finissions souffrirent rugissante",
+        new String[] { "fin", "souffr", "rug" });
+
+    // some everything else
+    // aujourd'hui stays one term which is OK
+    assertAnalyzesTo(
+        fa,
+        "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
+        new String[] {
+            "c3po",
+            "aujourd'hui",
+            "oeuf",
+            "ïâöûàä",
+            "anticonstitutionnel",
+            "jav" });
+  }
+
+  /**
+   * Tests analysis with the current version.
+   */
+  public void testAnalyzer() throws Exception {
+    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
+
+    checkCommonAnalysis(fa);
+
+    // some more everything else
+    // here both 1940-1945 and 1940:1945 are split into two terms
+    assertAnalyzesTo(
+        fa,
+        "33Bis 1940-1945 1940:1945 (---i+++)*",
+        new String[] { "33bis", "1940", "1945", "1940", "1945", "i" });
+  }
+
+  /**
+   * Tests analysis with {@code Version.LUCENE_30} behavior.
+   *
+   * @deprecated remove this test for Lucene 4.0
+   */
+  @Deprecated
+  public void testAnalyzer30() throws Exception {
+    FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_30);
+
+    checkCommonAnalysis(fa);
+
+    // some more everything else
+    // here 1940-1945 stays as one term, while 1940:1945 is split
+    assertAnalyzesTo(
+        fa,
+        "33Bis 1940-1945 1940:1945 (---i+++)*",
+        new String[] { "33bis", "1940-1945", "1940", "1945", "i" });
+  }
+
+  /**
+   * Tests that the analyzer produces the expected output when checked through
+   * the reusable token stream assertions.
+   */
+  public void testReusableTokenStream() throws Exception {
+    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
+
+    // stopwords
+    assertAnalyzesToReuse(
+        fa,
+        "le la chien les aux chat du des à cheval",
+        new String[] { "chien", "chat", "cheval" });
+
+    // some nouns and adjectives
+    assertAnalyzesToReuse(
+        fa,
+        "lances chismes habitable chiste éléments captifs",
+        new String[] {
+            "lanc",
+            "chism",
+            "habit",
+            "chist",
+            "élément",
+            "captif" });
+  }
+
+  /**
+   * Test that changes to the exclusion table are applied immediately
+   * when using reusable token streams.
+   */
+  public void testExclusionTableReuse() throws Exception {
+    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
+
+    // stemmed before the exclusion is registered
+    assertAnalyzesToReuse(fa, "habitable", new String[] { "habit" });
+
+    // left untouched once it is in the exclusion table
+    fa.setStemExclusionTable(new String[] { "habitable" });
+    assertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" });
+  }
+
+  /**
+   * Tests that a stem exclusion set passed to the constructor is honored,
+   * both with the reuse assertion and with the plain assertion.
+   */
+  public void testExclusionTableViaCtor() throws Exception {
+    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
+    set.add("habitable");
+
+    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT,
+        CharArraySet.EMPTY_SET, set);
+    assertAnalyzesToReuse(fa, "habitable chiste", new String[] { "habitable",
+        "chist" });
+
+    // same expectation on a fresh analyzer instance
+    fa = new FrenchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
+    assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
+        "chist" });
+  }
+
+  /**
+   * Tests that the elided article in "l'embrouille" is removed and the
+   * remaining word is stemmed.
+   */
+  public void testElision() throws Exception {
+    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
+    assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouill" });
+  }
+
+  /**
+   * Prior to 3.1, this analyzer had no lowercase filter, so stopwords were
+   * case sensitive. Preserve this for back compat.
+   *
+   * @deprecated Remove this test in Lucene 4.0
+   */
+  @Deprecated
+  public void testBuggyStopwordsCasing() throws IOException {
+    FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_30);
+    assertAnalyzesTo(a, "Votre", new String[] { "votr" });
+  }
+
+  /**
+   * Test that stopwords are not case sensitive.
+   */
+  public void testStopwordsCasing() throws IOException {
+    FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_31);
+    assertAnalyzesTo(a, "Votre", new String[] { });
+  }
+
+  /** Blast some random strings through the analyzer. */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new FrenchAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
+}