1 package org.apache.lucene.analysis.nl;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 import java.io.IOException;
23 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
24 import org.apache.lucene.analysis.Analyzer;
25 import org.apache.lucene.analysis.CharArraySet;
26 import org.apache.lucene.util.Version;
29 * Test the Dutch Stem Filter, which only modifies the term text.
31 * The code states that it uses the snowball algorithm, but tests reveal some differences.
34 public class TestDutchStemmer extends BaseTokenStreamTestCase {
36 public void testWithSnowballExamples() throws Exception {
37 check("lichaamsziek", "lichaamsziek");
38 check("lichamelijk", "licham");
39 check("lichamelijke", "licham");
40 check("lichamelijkheden", "licham");
41 check("lichamen", "licham");
42 check("lichere", "licher");
43 check("licht", "licht");
44 check("lichtbeeld", "lichtbeeld");
45 check("lichtbruin", "lichtbruin");
46 check("lichtdoorlatende", "lichtdoorlat");
47 check("lichte", "licht");
48 check("lichten", "licht");
49 check("lichtende", "lichtend");
50 check("lichtenvoorde", "lichtenvoord");
51 check("lichter", "lichter");
52 check("lichtere", "lichter");
53 check("lichters", "lichter");
54 check("lichtgevoeligheid", "lichtgevoel");
55 check("lichtgewicht", "lichtgewicht");
56 check("lichtgrijs", "lichtgrijs");
57 check("lichthoeveelheid", "lichthoevel");
58 check("lichtintensiteit", "lichtintensiteit");
59 check("lichtje", "lichtj");
60 check("lichtjes", "lichtjes");
61 check("lichtkranten", "lichtkrant");
62 check("lichtkring", "lichtkring");
63 check("lichtkringen", "lichtkring");
64 check("lichtregelsystemen", "lichtregelsystem");
65 check("lichtste", "lichtst");
66 check("lichtstromende", "lichtstrom");
67 check("lichtte", "licht");
68 check("lichtten", "licht");
69 check("lichttoetreding", "lichttoetred");
70 check("lichtverontreinigde", "lichtverontreinigd");
71 check("lichtzinnige", "lichtzinn");
73 check("lidia", "lidia");
74 check("lidmaatschap", "lidmaatschap");
75 check("lidstaten", "lidstat");
76 check("lidvereniging", "lidveren");
77 check("opgingen", "opging");
78 check("opglanzing", "opglanz");
79 check("opglanzingen", "opglanz");
80 check("opglimlachten", "opglimlacht");
81 check("opglimpen", "opglimp");
82 check("opglimpende", "opglimp");
83 check("opglimping", "opglimp");
84 check("opglimpingen", "opglimp");
85 check("opgraven", "opgrav");
86 check("opgrijnzen", "opgrijnz");
87 check("opgrijzende", "opgrijz");
88 check("opgroeien", "opgroei");
89 check("opgroeiende", "opgroei");
90 check("opgroeiplaats", "opgroeiplat");
91 check("ophaal", "ophal");
92 check("ophaaldienst", "ophaaldienst");
93 check("ophaalkosten", "ophaalkost");
94 check("ophaalsystemen", "ophaalsystem");
95 check("ophaalt", "ophaalt");
96 check("ophaaltruck", "ophaaltruck");
97 check("ophalen", "ophal");
98 check("ophalend", "ophal");
99 check("ophalers", "ophaler");
100 check("ophef", "ophef");
101 check("opheldering", "ophelder");
102 check("ophemelde", "ophemeld");
103 check("ophemelen", "ophemel");
104 check("opheusden", "opheusd");
105 check("ophief", "ophief");
106 check("ophield", "ophield");
107 check("ophieven", "ophiev");
108 check("ophoepelt", "ophoepelt");
109 check("ophoog", "ophog");
110 check("ophoogzand", "ophoogzand");
111 check("ophopen", "ophop");
112 check("ophoping", "ophop");
113 check("ophouden", "ophoud");
117 * @deprecated remove this test in Lucene 4.0
120 public void testOldBuggyStemmer() throws Exception {
121 Analyzer a = new DutchAnalyzer(Version.LUCENE_30);
122 checkOneTermReuse(a, "opheffen", "ophef"); // versus snowball 'opheff'
123 checkOneTermReuse(a, "opheffende", "ophef"); // versus snowball 'opheff'
124 checkOneTermReuse(a, "opheffing", "ophef"); // versus snowball 'opheff'
127 public void testSnowballCorrectness() throws Exception {
128 Analyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
129 checkOneTermReuse(a, "opheffen", "opheff");
130 checkOneTermReuse(a, "opheffende", "opheff");
131 checkOneTermReuse(a, "opheffing", "opheff");
134 public void testReusableTokenStream() throws Exception {
135 Analyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
136 checkOneTermReuse(a, "lichaamsziek", "lichaamsziek");
137 checkOneTermReuse(a, "lichamelijk", "licham");
138 checkOneTermReuse(a, "lichamelijke", "licham");
139 checkOneTermReuse(a, "lichamelijkheden", "licham");
143 * Test that changes to the exclusion table are applied immediately
144 * when using reusable token streams.
146 public void testExclusionTableReuse() throws Exception {
147 DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
148 checkOneTermReuse(a, "lichamelijk", "licham");
149 a.setStemExclusionTable(new String[] { "lichamelijk" });
150 checkOneTermReuse(a, "lichamelijk", "lichamelijk");
155 public void testExclusionTableViaCtor() throws IOException {
156 CharArraySet set = new CharArraySet(Version.LUCENE_30, 1, true);
157 set.add("lichamelijk");
158 DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
159 assertAnalyzesToReuse(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
161 a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
162 assertAnalyzesTo(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
167 * Test that changes to the dictionary stemming table are applied immediately
168 * when using reusable token streams.
170 public void testStemDictionaryReuse() throws Exception {
171 DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
172 checkOneTermReuse(a, "lichamelijk", "licham");
173 File customDictFile = getDataFile("customStemDict.txt");
174 a.setStemDictionary(customDictFile);
175 checkOneTermReuse(a, "lichamelijk", "somethingentirelydifferent");
179 * Prior to 3.1, this analyzer had no lowercase filter.
180 * stopwords were case sensitive. Preserve this for back compat.
181 * @deprecated Remove this test in Lucene 4.0
184 public void testBuggyStopwordsCasing() throws IOException {
185 DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_30);
186 assertAnalyzesTo(a, "Zelf", new String[] { "zelf" });
190 * Test that stopwords are not case sensitive
192 public void testStopwordsCasing() throws IOException {
193 DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_31);
194 assertAnalyzesTo(a, "Zelf", new String[] { });
197 private void check(final String input, final String expected) throws Exception {
198 checkOneTerm(new DutchAnalyzer(TEST_VERSION_CURRENT), input, expected);
201 /** blast some random strings through the analyzer */
202 public void testRandomStrings() throws Exception {
203 checkRandomData(random, new DutchAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);