1 package org.apache.lucene.analysis.nl;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
23 * A stemmer for Dutch words.
25 * The algorithm is an implementation of
26 * the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">Dutch stemming</a>
27 * algorithm in Martin Porter's snowball project.
29 * @deprecated Use {@link org.tartarus.snowball.ext.DutchStemmer} instead,
30 * which has the same functionality. This filter will be removed in Lucene 5.0
33 public class DutchStemmer {
35 * Buffer for the terms while stemming them.
37 private StringBuilder sb = new StringBuilder();
38 private boolean _removedE;
39 private Map _stemDict;
44 //TODO convert to internal
46 * Stems the given term to a unique <tt>discriminator</tt>.
48 * @param term The term that should be stemmed.
49 * @return Discriminator for <tt>term</tt>
51 public String stem(String term) {
52 term = term.toLowerCase();
53 if (!isStemmable(term))
55 if (_stemDict != null && _stemDict.containsKey(term))
56 if (_stemDict.get(term) instanceof String)
57 return (String) _stemDict.get(term);
61 // Reset the StringBuilder.
62 sb.delete(0, sb.length());
64 // Stemming starts here...
67 _R1 = getRIndex(sb, 0);
68 _R1 = Math.max(3, _R1);
71 _R2 = getRIndex(sb, _R1);
79 private boolean enEnding(StringBuilder sb) {
80 String[] enend = new String[]{"ene", "en"};
81 for (int i = 0; i < enend.length; i++) {
82 String end = enend[i];
83 String s = sb.toString();
84 int index = s.length() - end.length();
85 if (s.endsWith(end) &&
87 isValidEnEnding(sb, index - 1)
89 sb.delete(index, index + end.length());
98 private void step1(StringBuilder sb) {
99 if (_R1 >= sb.length())
102 String s = sb.toString();
103 int lengthR1 = sb.length() - _R1;
106 if (s.endsWith("heden")) {
107 sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid"));
114 if (s.endsWith("se") &&
115 (index = s.length() - 2) >= _R1 &&
116 isValidSEnding(sb, index - 1)
118 sb.delete(index, index + 2);
121 if (s.endsWith("s") &&
122 (index = s.length() - 1) >= _R1 &&
123 isValidSEnding(sb, index - 1)) {
124 sb.delete(index, index + 1);
129 * Delete suffix e if in R1 and
130 * preceded by a non-vowel, and then undouble the ending
132 * @param sb String being stemmed
134 private void step2(StringBuilder sb) {
136 if (_R1 >= sb.length())
138 String s = sb.toString();
139 int index = s.length() - 1;
142 !isVowel(sb.charAt(index - 1))) {
143 sb.delete(index, index + 1);
152 * @param sb String being stemmed
154 private void step3a(StringBuilder sb) {
155 if (_R2 >= sb.length())
157 String s = sb.toString();
158 int index = s.length() - 4;
159 if (s.endsWith("heid") && index >= _R2 && sb.charAt(index - 1) != 'c') {
160 sb.delete(index, index + 4); //remove heid
166 * <p>A d-suffix, or derivational suffix, enables a new word,
167 * often with a different grammatical category, or with a different
168 * sense, to be built from another word. Whether a d-suffix can be
169 * attached is discovered not from the rules of grammar, but by
170 * referring to a dictionary. So in English, ness can be added to
171 * certain adjectives to form corresponding nouns (littleness,
172 * kindness, foolishness ...) but not to all adjectives
173 * (not, for example, to big, cruel, wise ...). d-suffixes can be
174 * used to change meaning, often in rather exotic ways.</p>
175 * Remove "ing", "end", "ig", "lijk", "baar" and "bar"
177 * @param sb String being stemmed
179 private void step3b(StringBuilder sb) {
180 if (_R2 >= sb.length())
182 String s = sb.toString();
185 if ((s.endsWith("end") || s.endsWith("ing")) &&
186 (index = s.length() - 3) >= _R2) {
187 sb.delete(index, index + 3);
188 if (sb.charAt(index - 2) == 'i' &&
189 sb.charAt(index - 1) == 'g') {
190 if (sb.charAt(index - 3) != 'e' & index - 2 >= _R2) {
192 sb.delete(index, index + 2);
199 if (s.endsWith("ig") &&
200 (index = s.length() - 2) >= _R2
202 if (sb.charAt(index - 1) != 'e')
203 sb.delete(index, index + 2);
206 if (s.endsWith("lijk") &&
207 (index = s.length() - 4) >= _R2
209 sb.delete(index, index + 4);
213 if (s.endsWith("baar") &&
214 (index = s.length() - 4) >= _R2
216 sb.delete(index, index + 4);
219 if (s.endsWith("bar") &&
220 (index = s.length() - 3) >= _R2
223 sb.delete(index, index + 3);
230 * If the word ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
232 * @param sb String being stemmed
234 private void step4(StringBuilder sb) {
237 String end = sb.substring(sb.length() - 4, sb.length());
238 char c = end.charAt(0);
239 char v1 = end.charAt(1);
240 char v2 = end.charAt(2);
241 char d = end.charAt(3);
248 sb.delete(sb.length() - 2, sb.length() - 1);
253 * Checks if a term could be stemmed.
255 * @return true if, and only if, the given term consists only of letters.
257 private boolean isStemmable(String term) {
258 for (int c = 0; c < term.length(); c++) {
259 if (!Character.isLetter(term.charAt(c))) return false;
265 * Substitute ä, ë, ï, ö, ü, á, é, í, ó, ú
267 private void substitute(StringBuilder buffer) {
268 for (int i = 0; i < buffer.length(); i++) {
269 switch (buffer.charAt(i)) {
273 buffer.setCharAt(i, 'a');
279 buffer.setCharAt(i, 'e');
285 buffer.setCharAt(i, 'u');
291 buffer.setCharAt(i, 'i');
297 buffer.setCharAt(i, 'o');
304 /*private boolean isValidSEnding(StringBuilder sb) {
305 return isValidSEnding(sb, sb.length() - 1);
308 private boolean isValidSEnding(StringBuilder sb, int index) {
309 char c = sb.charAt(index);
310 if (isVowel(c) || c == 'j')
315 /*private boolean isValidEnEnding(StringBuilder sb) {
316 return isValidEnEnding(sb, sb.length() - 1);
319 private boolean isValidEnEnding(StringBuilder sb, int index) {
320 char c = sb.charAt(index);
326 if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index - 1) == 'e')
331 private void unDouble(StringBuilder sb) {
332 unDouble(sb, sb.length());
335 private void unDouble(StringBuilder sb, int endIndex) {
336 String s = sb.substring(0, endIndex);
337 if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn") || s.endsWith("mm") || s.endsWith("ff")) {
338 sb.delete(endIndex - 1, endIndex);
342 private int getRIndex(StringBuilder sb, int start) {
346 for (; i < sb.length(); i++) {
347 //first non-vowel preceded by a vowel
348 if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) {
355 private void storeYandI(StringBuilder sb) {
356 if (sb.charAt(0) == 'y')
357 sb.setCharAt(0, 'Y');
359 int last = sb.length() - 1;
361 for (int i = 1; i < last; i++) {
362 switch (sb.charAt(i)) {
365 if (isVowel(sb.charAt(i - 1)) &&
366 isVowel(sb.charAt(i + 1))
368 sb.setCharAt(i, 'I');
373 if (isVowel(sb.charAt(i - 1)))
374 sb.setCharAt(i, 'Y');
379 if (last > 0 && sb.charAt(last) == 'y' && isVowel(sb.charAt(last - 1)))
380 sb.setCharAt(last, 'Y');
383 private void reStoreYandI(StringBuilder sb) {
384 String tmp = sb.toString();
385 sb.delete(0, sb.length());
386 sb.insert(0, tmp.replaceAll("I", "i").replaceAll("Y", "y"));
389 private boolean isVowel(char c) {
405 void setStemDictionary(Map dict) {