lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java

   1 package org.apache.lucene.analysis.nl;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
import java.util.Locale;
import java.util.Map;
  21
  22 /**
  23  * A stemmer for Dutch words.
  24  * <p>
  25  * The algorithm is an implementation of
  26  * the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
  27  * algorithm in Martin Porter's snowball project.
  28  * </p>
  29  * @deprecated Use {@link org.tartarus.snowball.ext.DutchStemmer} instead,
  30  * which has the same functionality. This filter will be removed in Lucene 5.0
  31  */
  32 @Deprecated
  33 public class DutchStemmer {
  34   /**
  35    * Buffer for the terms while stemming them.
  36    */
  37   private StringBuilder sb = new StringBuilder();
  38   private boolean _removedE;
  39   private Map _stemDict;
  40
  41   private int _R1;
  42   private int _R2;
  43
  44   //TODO convert to internal
  45   /*
  46    * Stems the given term to an unique <tt>discriminator</tt>.
  47    *
  48    * @param term The term that should be stemmed.
  49    * @return Discriminator for <tt>term</tt>
  50    */
  51   public String stem(String term) {
  52     term = term.toLowerCase();
  53     if (!isStemmable(term))
  54       return term;
  55     if (_stemDict != null && _stemDict.containsKey(term))
  56       if (_stemDict.get(term) instanceof String)
  57         return (String) _stemDict.get(term);
  58       else
  59         return null;
  60
  61     // Reset the StringBuilder.
  62     sb.delete(0, sb.length());
  63     sb.insert(0, term);
  64     // Stemming starts here...
  65     substitute(sb);
  66     storeYandI(sb);
  67     _R1 = getRIndex(sb, 0);
  68     _R1 = Math.max(3, _R1);
  69     step1(sb);
  70     step2(sb);
  71     _R2 = getRIndex(sb, _R1);
  72     step3a(sb);
  73     step3b(sb);
  74     step4(sb);
  75     reStoreYandI(sb);
  76     return sb.toString();
  77   }
  78
  79   private boolean enEnding(StringBuilder sb) {
  80     String[] enend = new String[]{"ene", "en"};
  81     for (int i = 0; i < enend.length; i++) {
  82       String end = enend[i];
  83       String s = sb.toString();
  84       int index = s.length() - end.length();
  85       if (s.endsWith(end) &&
  86           index >= _R1 &&
  87           isValidEnEnding(sb, index - 1)
  88       ) {
  89         sb.delete(index, index + end.length());
  90         unDouble(sb, index);
  91         return true;
  92       }
  93     }
  94     return false;
  95   }
  96
  97
  98   private void step1(StringBuilder sb) {
  99     if (_R1 >= sb.length())
 100       return;
 101
 102     String s = sb.toString();
 103     int lengthR1 = sb.length() - _R1;
 104     int index;
 105
 106     if (s.endsWith("heden")) {
 107       sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid"));
 108       return;
 109     }
 110
 111     if (enEnding(sb))
 112       return;
 113
 114     if (s.endsWith("se") &&
 115         (index = s.length() - 2) >= _R1 &&
 116         isValidSEnding(sb, index - 1)
 117     ) {
 118       sb.delete(index, index + 2);
 119       return;
 120     }
 121     if (s.endsWith("s") &&
 122         (index = s.length() - 1) >= _R1 &&
 123         isValidSEnding(sb, index - 1)) {
 124       sb.delete(index, index + 1);
 125     }
 126   }
 127
 128   /**
 129    * Delete suffix e if in R1 and
 130    * preceded by a non-vowel, and then undouble the ending
 131    *
 132    * @param sb String being stemmed
 133    */
 134   private void step2(StringBuilder sb) {
 135     _removedE = false;
 136     if (_R1 >= sb.length())
 137       return;
 138     String s = sb.toString();
 139     int index = s.length() - 1;
 140     if (index >= _R1 &&
 141         s.endsWith("e") &&
 142         !isVowel(sb.charAt(index - 1))) {
 143       sb.delete(index, index + 1);
 144       unDouble(sb);
 145       _removedE = true;
 146     }
 147   }
 148
 149   /**
 150    * Delete "heid"
 151    *
 152    * @param sb String being stemmed
 153    */
 154   private void step3a(StringBuilder sb) {
 155     if (_R2 >= sb.length())
 156       return;
 157     String s = sb.toString();
 158     int index = s.length() - 4;
 159     if (s.endsWith("heid") && index >= _R2 && sb.charAt(index - 1) != 'c') {
 160       sb.delete(index, index + 4); //remove heid
 161       enEnding(sb);
 162     }
 163   }
 164
 165   /**
 166    * <p>A d-suffix, or derivational suffix, enables a new word,
 167    * often with a different grammatical category, or with a different
 168    * sense, to be built from another word. Whether a d-suffix can be
 169    * attached is discovered not from the rules of grammar, but by
 170    * referring to a dictionary. So in English, ness can be added to
 171    * certain adjectives to form corresponding nouns (littleness,
 172    * kindness, foolishness ...) but not to all adjectives
 173    * (not for example, to big, cruel, wise ...) d-suffixes can be
 174    * used to change meaning, often in rather exotic ways.</p>
 175    * Remove "ing", "end", "ig", "lijk", "baar" and "bar"
 176    *
 177    * @param sb String being stemmed
 178    */
 179   private void step3b(StringBuilder sb) {
 180     if (_R2 >= sb.length())
 181       return;
 182     String s = sb.toString();
 183     int index = 0;
 184
 185     if ((s.endsWith("end") || s.endsWith("ing")) &&
 186         (index = s.length() - 3) >= _R2) {
 187       sb.delete(index, index + 3);
 188       if (sb.charAt(index - 2) == 'i' &&
 189           sb.charAt(index - 1) == 'g') {
 190         if (sb.charAt(index - 3) != 'e' & index - 2 >= _R2) {
 191           index -= 2;
 192           sb.delete(index, index + 2);
 193         }
 194       } else {
 195         unDouble(sb, index);
 196       }
 197       return;
 198     }
 199     if (s.endsWith("ig") &&
 200         (index = s.length() - 2) >= _R2
 201     ) {
 202       if (sb.charAt(index - 1) != 'e')
 203         sb.delete(index, index + 2);
 204       return;
 205     }
 206     if (s.endsWith("lijk") &&
 207         (index = s.length() - 4) >= _R2
 208     ) {
 209       sb.delete(index, index + 4);
 210       step2(sb);
 211       return;
 212     }
 213     if (s.endsWith("baar") &&
 214         (index = s.length() - 4) >= _R2
 215     ) {
 216       sb.delete(index, index + 4);
 217       return;
 218     }
 219     if (s.endsWith("bar") &&
 220         (index = s.length() - 3) >= _R2
 221     ) {
 222       if (_removedE)
 223         sb.delete(index, index + 3);
 224       return;
 225     }
 226   }
 227
 228   /**
 229    * undouble vowel
 230    * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
 231    *
 232    * @param sb String being stemmed
 233    */
 234   private void step4(StringBuilder sb) {
 235     if (sb.length() < 4)
 236       return;
 237     String end = sb.substring(sb.length() - 4, sb.length());
 238     char c = end.charAt(0);
 239     char v1 = end.charAt(1);
 240     char v2 = end.charAt(2);
 241     char d = end.charAt(3);
 242     if (v1 == v2 &&
 243         d != 'I' &&
 244         v1 != 'i' &&
 245         isVowel(v1) &&
 246         !isVowel(d) &&
 247         !isVowel(c)) {
 248       sb.delete(sb.length() - 2, sb.length() - 1);
 249     }
 250   }
 251
 252   /**
 253    * Checks if a term could be stemmed.
 254    *
 255    * @return true if, and only if, the given term consists in letters.
 256    */
 257   private boolean isStemmable(String term) {
 258     for (int c = 0; c < term.length(); c++) {
 259       if (!Character.isLetter(term.charAt(c))) return false;
 260     }
 261     return true;
 262   }
 263
 264   /**
 265    * Substitute ä, ë, ï, ö, ü, á , é, í, ó, ú
 266    */
 267   private void substitute(StringBuilder buffer) {
 268     for (int i = 0; i < buffer.length(); i++) {
 269       switch (buffer.charAt(i)) {
 270         case 'ä':
 271         case 'á':
 272           {
 273             buffer.setCharAt(i, 'a');
 274             break;
 275           }
 276         case 'ë':
 277         case 'é':
 278           {
 279             buffer.setCharAt(i, 'e');
 280             break;
 281           }
 282         case 'ü':
 283         case 'ú':
 284           {
 285             buffer.setCharAt(i, 'u');
 286             break;
 287           }
 288         case 'ï':
 289         case 'i':
 290           {
 291             buffer.setCharAt(i, 'i');
 292             break;
 293           }
 294         case 'ö':
 295         case 'ó':
 296           {
 297             buffer.setCharAt(i, 'o');
 298             break;
 299           }
 300       }
 301     }
 302   }
 303
 304   /*private boolean isValidSEnding(StringBuilder sb) {
 305     return isValidSEnding(sb, sb.length() - 1);
 306   }*/
 307
 308   private boolean isValidSEnding(StringBuilder sb, int index) {
 309     char c = sb.charAt(index);
 310     if (isVowel(c) || c == 'j')
 311       return false;
 312     return true;
 313   }
 314
 315   /*private boolean isValidEnEnding(StringBuilder sb) {
 316     return isValidEnEnding(sb, sb.length() - 1);
 317   }*/
 318
 319   private boolean isValidEnEnding(StringBuilder sb, int index) {
 320     char c = sb.charAt(index);
 321     if (isVowel(c))
 322       return false;
 323     if (c < 3)
 324       return false;
 325     // ends with "gem"?
 326     if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index - 1) == 'e')
 327       return false;
 328     return true;
 329   }
 330
 331   private void unDouble(StringBuilder sb) {
 332     unDouble(sb, sb.length());
 333   }
 334
 335   private void unDouble(StringBuilder sb, int endIndex) {
 336     String s = sb.substring(0, endIndex);
 337     if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn") || s.endsWith("mm") || s.endsWith("ff")) {
 338       sb.delete(endIndex - 1, endIndex);
 339     }
 340   }
 341
 342   private int getRIndex(StringBuilder sb, int start) {
 343     if (start == 0)
 344       start = 1;
 345     int i = start;
 346     for (; i < sb.length(); i++) {
 347       //first non-vowel preceded by a vowel
 348       if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) {
 349         return i + 1;
 350       }
 351     }
 352     return i + 1;
 353   }
 354
 355   private void storeYandI(StringBuilder sb) {
 356     if (sb.charAt(0) == 'y')
 357       sb.setCharAt(0, 'Y');
 358
 359     int last = sb.length() - 1;
 360
 361     for (int i = 1; i < last; i++) {
 362       switch (sb.charAt(i)) {
 363         case 'i':
 364           {
 365             if (isVowel(sb.charAt(i - 1)) &&
 366                 isVowel(sb.charAt(i + 1))
 367             )
 368               sb.setCharAt(i, 'I');
 369             break;
 370           }
 371         case 'y':
 372           {
 373             if (isVowel(sb.charAt(i - 1)))
 374               sb.setCharAt(i, 'Y');
 375             break;
 376           }
 377       }
 378     }
 379     if (last > 0 && sb.charAt(last) == 'y' && isVowel(sb.charAt(last - 1)))
 380       sb.setCharAt(last, 'Y');
 381   }
 382
 383   private void reStoreYandI(StringBuilder sb) {
 384     String tmp = sb.toString();
 385     sb.delete(0, sb.length());
 386     sb.insert(0, tmp.replaceAll("I", "i").replaceAll("Y", "y"));
 387   }
 388
 389   private boolean isVowel(char c) {
 390     switch (c) {
 391       case 'e':
 392       case 'a':
 393       case 'o':
 394       case 'i':
 395       case 'u':
 396       case 'y':
 397       case 'è':
 398         {
 399           return true;
 400         }
 401     }
 402     return false;
 403   }
 404
 405   void setStemDictionary(Map dict) {
 406     _stemDict = dict;
 407   }
 408
 409 }