pylucene 3.5.0-3
[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / analyzers / common / src / test / org / apache / lucene / analysis / fr / TestFrenchAnalyzer.java
diff --git a/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
new file mode 100644 (file)
index 0000000..e36c497
--- /dev/null
@@ -0,0 +1,279 @@
+package org.apache.lucene.analysis.fr;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.util.Version;
+
+/**
+ * Test case for FrenchAnalyzer.
+ *
+ * @version   $version$
+ */
+
+public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
+
+       public void testAnalyzer() throws Exception {
+               FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
+       
+               assertAnalyzesTo(fa, "", new String[] {
+               });
+
+               assertAnalyzesTo(
+                       fa,
+                       "chien chat cheval",
+                       new String[] { "chien", "chat", "cheval" });
+
+               assertAnalyzesTo(
+                       fa,
+                       "chien CHAT CHEVAL",
+                       new String[] { "chien", "chat", "cheval" });
+
+               assertAnalyzesTo(
+                       fa,
+                       "  chien  ,? + = -  CHAT /: > CHEVAL",
+                       new String[] { "chien", "chat", "cheval" });
+
+               assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
+
+               assertAnalyzesTo(
+                       fa,
+                       "mot \"entreguillemet\"",
+                       new String[] { "mot", "entreguillemet" });
+
+               // let's do some french specific tests now      
+
+               /* 1. couldn't resist
+                I would expect this to stay one term as in French the minus 
+               sign is often used for composing words */
+               assertAnalyzesTo(
+                       fa,
+                       "Jean-François",
+                       new String[] { "jean", "françois" });
+
+               // 2. stopwords
+               assertAnalyzesTo(
+                       fa,
+                       "le la chien les aux chat du des à cheval",
+                       new String[] { "chien", "chat", "cheval" });
+
+               // some nouns and adjectives
+               assertAnalyzesTo(
+                       fa,
+                       "lances chismes habitable chiste éléments captifs",
+                       new String[] {
+                               "lanc",
+                               "chism",
+                               "habit",
+                               "chist",
+                               "élément",
+                               "captif" });
+
+               // some verbs
+               assertAnalyzesTo(
+                       fa,
+                       "finissions souffrirent rugissante",
+                       new String[] { "fin", "souffr", "rug" });
+
+               // some everything else
+               // aujourd'hui stays one term which is OK
+               assertAnalyzesTo(
+                       fa,
+                       "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
+                       new String[] {
+                               "c3po",
+                               "aujourd'hui",
+                               "oeuf",
+                               "ïâöûàä",
+                               "anticonstitutionnel",
+                               "jav" });
+
+               // some more everything else
+               // here 1940-1945 stays as one term, 1940:1945 not ?
+               assertAnalyzesTo(
+                       fa,
+                       "33Bis 1940-1945 1940:1945 (---i+++)*",
+                       new String[] { "33bis", "1940", "1945", "1940", "1945", "i" });
+
+       }
+       
+       /**
+        * @deprecated remove this test for Lucene 4.0
+        */
+       @Deprecated
+       public void testAnalyzer30() throws Exception {
+           FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_30);
+         
+           assertAnalyzesTo(fa, "", new String[] {
+           });
+
+           assertAnalyzesTo(
+             fa,
+             "chien chat cheval",
+             new String[] { "chien", "chat", "cheval" });
+
+           assertAnalyzesTo(
+             fa,
+             "chien CHAT CHEVAL",
+             new String[] { "chien", "chat", "cheval" });
+
+           assertAnalyzesTo(
+             fa,
+             "  chien  ,? + = -  CHAT /: > CHEVAL",
+             new String[] { "chien", "chat", "cheval" });
+
+           assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
+
+           assertAnalyzesTo(
+             fa,
+             "mot \"entreguillemet\"",
+             new String[] { "mot", "entreguillemet" });
+
+           // let's do some french specific tests now  
+
+           /* 1. couldn't resist
+            I would expect this to stay one term as in French the minus 
+           sign is often used for composing words */
+           assertAnalyzesTo(
+             fa,
+             "Jean-François",
+             new String[] { "jean", "françois" });
+
+           // 2. stopwords
+           assertAnalyzesTo(
+             fa,
+             "le la chien les aux chat du des à cheval",
+             new String[] { "chien", "chat", "cheval" });
+
+           // some nouns and adjectives
+           assertAnalyzesTo(
+             fa,
+             "lances chismes habitable chiste éléments captifs",
+             new String[] {
+               "lanc",
+               "chism",
+               "habit",
+               "chist",
+               "élément",
+               "captif" });
+
+           // some verbs
+           assertAnalyzesTo(
+             fa,
+             "finissions souffrirent rugissante",
+             new String[] { "fin", "souffr", "rug" });
+
+           // some everything else
+           // aujourd'hui stays one term which is OK
+           assertAnalyzesTo(
+             fa,
+             "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
+             new String[] {
+               "c3po",
+               "aujourd'hui",
+               "oeuf",
+               "ïâöûàä",
+               "anticonstitutionnel",
+               "jav" });
+
+           // some more everything else
+           // here 1940-1945 stays as one term, 1940:1945 not ?
+           assertAnalyzesTo(
+             fa,
+             "33Bis 1940-1945 1940:1945 (---i+++)*",
+             new String[] { "33bis", "1940-1945", "1940", "1945", "i" });
+
+         }
+       
+       public void testReusableTokenStream() throws Exception {
+         FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
+         // stopwords
+      assertAnalyzesToReuse(
+          fa,
+          "le la chien les aux chat du des à cheval",
+          new String[] { "chien", "chat", "cheval" });
+
+      // some nouns and adjectives
+      assertAnalyzesToReuse(
+          fa,
+          "lances chismes habitable chiste éléments captifs",
+          new String[] {
+              "lanc",
+              "chism",
+              "habit",
+              "chist",
+              "élément",
+              "captif" });
+       }
+
+       /* 
+        * Test that changes to the exclusion table are applied immediately
+        * when using reusable token streams.
+        */
+       public void testExclusionTableReuse() throws Exception {
+         FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
+         assertAnalyzesToReuse(fa, "habitable", new String[] { "habit" });
+         fa.setStemExclusionTable(new String[] { "habitable" });
+         assertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" });
+       }
+       
+  public void testExclusionTableViaCtor() throws Exception {
+    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
+    set.add("habitable");
+    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT,
+        CharArraySet.EMPTY_SET, set);
+    assertAnalyzesToReuse(fa, "habitable chiste", new String[] { "habitable",
+        "chist" });
+
+    fa = new FrenchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
+    assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
+        "chist" });
+  }
+  
+  public void testElision() throws Exception {
+    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
+    assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouill" });
+  }
+  
+  /**
+   * Prior to 3.1, this analyzer had no lowercase filter.
+   * stopwords were case sensitive. Preserve this for back compat.
+   * @deprecated Remove this test in Lucene 4.0
+   */
+  @Deprecated
+  public void testBuggyStopwordsCasing() throws IOException {
+    FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_30);
+    assertAnalyzesTo(a, "Votre", new String[] { "votr" });
+  }
+  
+  /**
+   * Test that stopwords are not case sensitive
+   */
+  public void testStopwordsCasing() throws IOException {
+    FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_31);
+    assertAnalyzesTo(a, "Votre", new String[] { });
+  }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new FrenchAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
+}