lucene-java-3.5.0/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java

   1 package org.apache.lucene.analysis.fr;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21
  22 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
  23 import org.apache.lucene.analysis.CharArraySet;
  24 import org.apache.lucene.util.Version;
  25
  26 /**
  27  * Test case for FrenchAnalyzer.
  28  *
  29  * @version   $version$
  30  */
  31
  32 public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
  33
  34         public void testAnalyzer() throws Exception {
  35                 FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
  36
  37                 assertAnalyzesTo(fa, "", new String[] {
  38                 });
  39
  40                 assertAnalyzesTo(
  41                         fa,
  42                         "chien chat cheval",
  43                         new String[] { "chien", "chat", "cheval" });
  44
  45                 assertAnalyzesTo(
  46                         fa,
  47                         "chien CHAT CHEVAL",
  48                         new String[] { "chien", "chat", "cheval" });
  49
  50                 assertAnalyzesTo(
  51                         fa,
  52                         "  chien  ,? + = -  CHAT /: > CHEVAL",
  53                         new String[] { "chien", "chat", "cheval" });
  54
  55                 assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
  56
  57                 assertAnalyzesTo(
  58                         fa,
  59                         "mot \"entreguillemet\"",
  60                         new String[] { "mot", "entreguillemet" });
  61
  62                 // let's do some french specific tests now
  63
  64                 /* 1. couldn't resist
  65                  I would expect this to stay one term as in French the minus
  66                 sign is often used for composing words */
  67                 assertAnalyzesTo(
  68                         fa,
  69                         "Jean-François",
  70                         new String[] { "jean", "françois" });
  71
  72                 // 2. stopwords
  73                 assertAnalyzesTo(
  74                         fa,
  75                         "le la chien les aux chat du des à cheval",
  76                         new String[] { "chien", "chat", "cheval" });
  77
  78                 // some nouns and adjectives
  79                 assertAnalyzesTo(
  80                         fa,
  81                         "lances chismes habitable chiste éléments captifs",
  82                         new String[] {
  83                                 "lanc",
  84                                 "chism",
  85                                 "habit",
  86                                 "chist",
  87                                 "élément",
  88                                 "captif" });
  89
  90                 // some verbs
  91                 assertAnalyzesTo(
  92                         fa,
  93                         "finissions souffrirent rugissante",
  94                         new String[] { "fin", "souffr", "rug" });
  95
  96                 // some everything else
  97                 // aujourd'hui stays one term which is OK
  98                 assertAnalyzesTo(
  99                         fa,
 100                         "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
 101                         new String[] {
 102                                 "c3po",
 103                                 "aujourd'hui",
 104                                 "oeuf",
 105                                 "ïâöûàä",
 106                                 "anticonstitutionnel",
 107                                 "jav" });
 108
 109                 // some more everything else
 110                 // here 1940-1945 stays as one term, 1940:1945 not ?
 111                 assertAnalyzesTo(
 112                         fa,
 113                         "33Bis 1940-1945 1940:1945 (---i+++)*",
 114                         new String[] { "33bis", "1940", "1945", "1940", "1945", "i" });
 115
 116         }
 117
 118         /**
 119          * @deprecated remove this test for Lucene 4.0
 120          */
 121         @Deprecated
 122         public void testAnalyzer30() throws Exception {
 123             FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_30);
 124
 125             assertAnalyzesTo(fa, "", new String[] {
 126             });
 127
 128             assertAnalyzesTo(
 129               fa,
 130               "chien chat cheval",
 131               new String[] { "chien", "chat", "cheval" });
 132
 133             assertAnalyzesTo(
 134               fa,
 135               "chien CHAT CHEVAL",
 136               new String[] { "chien", "chat", "cheval" });
 137
 138             assertAnalyzesTo(
 139               fa,
 140               "  chien  ,? + = -  CHAT /: > CHEVAL",
 141               new String[] { "chien", "chat", "cheval" });
 142
 143             assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
 144
 145             assertAnalyzesTo(
 146               fa,
 147               "mot \"entreguillemet\"",
 148               new String[] { "mot", "entreguillemet" });
 149
 150             // let's do some french specific tests now
 151
 152             /* 1. couldn't resist
 153              I would expect this to stay one term as in French the minus
 154             sign is often used for composing words */
 155             assertAnalyzesTo(
 156               fa,
 157               "Jean-François",
 158               new String[] { "jean", "françois" });
 159
 160             // 2. stopwords
 161             assertAnalyzesTo(
 162               fa,
 163               "le la chien les aux chat du des à cheval",
 164               new String[] { "chien", "chat", "cheval" });
 165
 166             // some nouns and adjectives
 167             assertAnalyzesTo(
 168               fa,
 169               "lances chismes habitable chiste éléments captifs",
 170               new String[] {
 171                 "lanc",
 172                 "chism",
 173                 "habit",
 174                 "chist",
 175                 "élément",
 176                 "captif" });
 177
 178             // some verbs
 179             assertAnalyzesTo(
 180               fa,
 181               "finissions souffrirent rugissante",
 182               new String[] { "fin", "souffr", "rug" });
 183
 184             // some everything else
 185             // aujourd'hui stays one term which is OK
 186             assertAnalyzesTo(
 187               fa,
 188               "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
 189               new String[] {
 190                 "c3po",
 191                 "aujourd'hui",
 192                 "oeuf",
 193                 "ïâöûàä",
 194                 "anticonstitutionnel",
 195                 "jav" });
 196
 197             // some more everything else
 198             // here 1940-1945 stays as one term, 1940:1945 not ?
 199             assertAnalyzesTo(
 200               fa,
 201               "33Bis 1940-1945 1940:1945 (---i+++)*",
 202               new String[] { "33bis", "1940-1945", "1940", "1945", "i" });
 203
 204           }
 205
 206         public void testReusableTokenStream() throws Exception {
 207           FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
 208           // stopwords
 209       assertAnalyzesToReuse(
 210           fa,
 211           "le la chien les aux chat du des à cheval",
 212           new String[] { "chien", "chat", "cheval" });
 213
 214       // some nouns and adjectives
 215       assertAnalyzesToReuse(
 216           fa,
 217           "lances chismes habitable chiste éléments captifs",
 218           new String[] {
 219               "lanc",
 220               "chism",
 221               "habit",
 222               "chist",
 223               "élément",
 224               "captif" });
 225         }
 226
 227         /*
 228          * Test that changes to the exclusion table are applied immediately
 229          * when using reusable token streams.
 230          */
 231         public void testExclusionTableReuse() throws Exception {
 232           FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
 233           assertAnalyzesToReuse(fa, "habitable", new String[] { "habit" });
 234           fa.setStemExclusionTable(new String[] { "habitable" });
 235           assertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" });
 236         }
 237
 238   public void testExclusionTableViaCtor() throws Exception {
 239     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
 240     set.add("habitable");
 241     FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT,
 242         CharArraySet.EMPTY_SET, set);
 243     assertAnalyzesToReuse(fa, "habitable chiste", new String[] { "habitable",
 244         "chist" });
 245
 246     fa = new FrenchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
 247     assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
 248         "chist" });
 249   }
 250
 251   public void testElision() throws Exception {
 252     FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
 253     assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouill" });
 254   }
 255
 256   /**
 257    * Prior to 3.1, this analyzer had no lowercase filter.
 258    * stopwords were case sensitive. Preserve this for back compat.
 259    * @deprecated Remove this test in Lucene 4.0
 260    */
 261   @Deprecated
 262   public void testBuggyStopwordsCasing() throws IOException {
 263     FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_30);
 264     assertAnalyzesTo(a, "Votre", new String[] { "votr" });
 265   }
 266
 267   /**
 268    * Test that stopwords are not case sensitive
 269    */
 270   public void testStopwordsCasing() throws IOException {
 271     FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_31);
 272     assertAnalyzesTo(a, "Votre", new String[] { });
 273   }
 274
 275   /** blast some random strings through the analyzer */
 276   public void testRandomStrings() throws Exception {
 277     checkRandomData(random, new FrenchAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
 278   }
 279 }