lucene-java-3.4.0/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java

   1 package org.apache.lucene.analysis.nl;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.File;
  21 import java.io.IOException;
  22
  23 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
  24 import org.apache.lucene.analysis.Analyzer;
  25 import org.apache.lucene.analysis.CharArraySet;
  26 import org.apache.lucene.util.Version;
  27
  28 /**
  29  * Test the Dutch Stem Filter, which only modifies the term text.
  30  *
  31  * The code states that it uses the snowball algorithm, but tests reveal some differences.
  32  *
  33  */
  34 public class TestDutchStemmer extends BaseTokenStreamTestCase {
  35
  36   public void testWithSnowballExamples() throws Exception {
  37          check("lichaamsziek", "lichaamsziek");
  38          check("lichamelijk", "licham");
  39          check("lichamelijke", "licham");
  40          check("lichamelijkheden", "licham");
  41          check("lichamen", "licham");
  42          check("lichere", "licher");
  43          check("licht", "licht");
  44          check("lichtbeeld", "lichtbeeld");
  45          check("lichtbruin", "lichtbruin");
  46          check("lichtdoorlatende", "lichtdoorlat");
  47          check("lichte", "licht");
  48          check("lichten", "licht");
  49          check("lichtende", "lichtend");
  50          check("lichtenvoorde", "lichtenvoord");
  51          check("lichter", "lichter");
  52          check("lichtere", "lichter");
  53          check("lichters", "lichter");
  54          check("lichtgevoeligheid", "lichtgevoel");
  55          check("lichtgewicht", "lichtgewicht");
  56          check("lichtgrijs", "lichtgrijs");
  57          check("lichthoeveelheid", "lichthoevel");
  58          check("lichtintensiteit", "lichtintensiteit");
  59          check("lichtje", "lichtj");
  60          check("lichtjes", "lichtjes");
  61          check("lichtkranten", "lichtkrant");
  62          check("lichtkring", "lichtkring");
  63          check("lichtkringen", "lichtkring");
  64          check("lichtregelsystemen", "lichtregelsystem");
  65          check("lichtste", "lichtst");
  66          check("lichtstromende", "lichtstrom");
  67          check("lichtte", "licht");
  68          check("lichtten", "licht");
  69          check("lichttoetreding", "lichttoetred");
  70          check("lichtverontreinigde", "lichtverontreinigd");
  71          check("lichtzinnige", "lichtzinn");
  72          check("lid", "lid");
  73          check("lidia", "lidia");
  74          check("lidmaatschap", "lidmaatschap");
  75          check("lidstaten", "lidstat");
  76          check("lidvereniging", "lidveren");
  77          check("opgingen", "opging");
  78          check("opglanzing", "opglanz");
  79          check("opglanzingen", "opglanz");
  80          check("opglimlachten", "opglimlacht");
  81          check("opglimpen", "opglimp");
  82          check("opglimpende", "opglimp");
  83          check("opglimping", "opglimp");
  84          check("opglimpingen", "opglimp");
  85          check("opgraven", "opgrav");
  86          check("opgrijnzen", "opgrijnz");
  87          check("opgrijzende", "opgrijz");
  88          check("opgroeien", "opgroei");
  89          check("opgroeiende", "opgroei");
  90          check("opgroeiplaats", "opgroeiplat");
  91          check("ophaal", "ophal");
  92          check("ophaaldienst", "ophaaldienst");
  93          check("ophaalkosten", "ophaalkost");
  94          check("ophaalsystemen", "ophaalsystem");
  95          check("ophaalt", "ophaalt");
  96          check("ophaaltruck", "ophaaltruck");
  97          check("ophalen", "ophal");
  98          check("ophalend", "ophal");
  99          check("ophalers", "ophaler");
 100          check("ophef", "ophef");
 101          check("opheldering", "ophelder");
 102          check("ophemelde", "ophemeld");
 103          check("ophemelen", "ophemel");
 104          check("opheusden", "opheusd");
 105          check("ophief", "ophief");
 106          check("ophield", "ophield");
 107          check("ophieven", "ophiev");
 108          check("ophoepelt", "ophoepelt");
 109          check("ophoog", "ophog");
 110          check("ophoogzand", "ophoogzand");
 111          check("ophopen", "ophop");
 112          check("ophoping", "ophop");
 113          check("ophouden", "ophoud");
 114   }
 115
 116   /**
 117    * @deprecated remove this test in Lucene 4.0
 118    */
 119   @Deprecated
 120   public void testOldBuggyStemmer() throws Exception {
 121     Analyzer a = new DutchAnalyzer(Version.LUCENE_30);
 122     checkOneTermReuse(a, "opheffen", "ophef"); // versus snowball 'opheff'
 123     checkOneTermReuse(a, "opheffende", "ophef"); // versus snowball 'opheff'
 124     checkOneTermReuse(a, "opheffing", "ophef"); // versus snowball 'opheff'
 125   }
 126
 127   public void testSnowballCorrectness() throws Exception {
 128     Analyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
 129     checkOneTermReuse(a, "opheffen", "opheff");
 130     checkOneTermReuse(a, "opheffende", "opheff");
 131     checkOneTermReuse(a, "opheffing", "opheff");
 132   }
 133
 134   public void testReusableTokenStream() throws Exception {
 135     Analyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
 136     checkOneTermReuse(a, "lichaamsziek", "lichaamsziek");
 137     checkOneTermReuse(a, "lichamelijk", "licham");
 138     checkOneTermReuse(a, "lichamelijke", "licham");
 139     checkOneTermReuse(a, "lichamelijkheden", "licham");
 140   }
 141
 142   /*
 143    * Test that changes to the exclusion table are applied immediately
 144    * when using reusable token streams.
 145    */
 146   public void testExclusionTableReuse() throws Exception {
 147     DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
 148     checkOneTermReuse(a, "lichamelijk", "licham");
 149     a.setStemExclusionTable(new String[] { "lichamelijk" });
 150     checkOneTermReuse(a, "lichamelijk", "lichamelijk");
 151
 152
 153   }
 154
 155   public void testExclusionTableViaCtor() throws IOException {
 156     CharArraySet set = new CharArraySet(Version.LUCENE_30, 1, true);
 157     set.add("lichamelijk");
 158     DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
 159     assertAnalyzesToReuse(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
 160
 161     a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
 162     assertAnalyzesTo(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
 163
 164   }
 165
 166   /*
 167    * Test that changes to the dictionary stemming table are applied immediately
 168    * when using reusable token streams.
 169    */
 170   public void testStemDictionaryReuse() throws Exception {
 171     DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
 172     checkOneTermReuse(a, "lichamelijk", "licham");
 173     File customDictFile = getDataFile("customStemDict.txt");
 174     a.setStemDictionary(customDictFile);
 175     checkOneTermReuse(a, "lichamelijk", "somethingentirelydifferent");
 176   }
 177
 178   /**
 179    * Prior to 3.1, this analyzer had no lowercase filter.
 180    * stopwords were case sensitive. Preserve this for back compat.
 181    * @deprecated Remove this test in Lucene 4.0
 182    */
 183   @Deprecated
 184   public void testBuggyStopwordsCasing() throws IOException {
 185     DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_30);
 186     assertAnalyzesTo(a, "Zelf", new String[] { "zelf" });
 187   }
 188
 189   /**
 190    * Test that stopwords are not case sensitive
 191    */
 192   public void testStopwordsCasing() throws IOException {
 193     DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_31);
 194     assertAnalyzesTo(a, "Zelf", new String[] { });
 195   }
 196
 197   private void check(final String input, final String expected) throws Exception {
 198     checkOneTerm(new DutchAnalyzer(TEST_VERSION_CURRENT), input, expected);
 199   }
 200
 201   /** blast some random strings through the analyzer */
 202   public void testRandomStrings() throws Exception {
 203     checkRandomData(random, new DutchAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
 204   }
 205
 206 }