lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java

   1 package org.apache.lucene.analysis.cz;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import static org.apache.lucene.analysis.util.StemmerUtil.*;
  21
  22 /**
  23  * Light Stemmer for Czech.
  24  * <p>
  25  * Implements the algorithm described in:
  26  * <i>
  27  * Indexing and stemming approaches for the Czech language
  28  * </i>
  29  * http://portal.acm.org/citation.cfm?id=1598600
  30  * </p>
  31  */
  32 public class CzechStemmer {
  33
  34   /**
  35    * Stem an input buffer of Czech text.
  36    *
  37    * @param s input buffer
  38    * @param len length of input buffer
  39    * @return length of input buffer after normalization
  40    *
  41    * <p><b>NOTE</b>: Input is expected to be in lowercase,
  42    * but with diacritical marks</p>
  43    */
  44   public int stem(char s[], int len) {
  45     len = removeCase(s, len);
  46     len = removePossessives(s, len);
  47     len = normalize(s, len);
  48     return len;
  49   }
  50
  51   private int removeCase(char s[], int len) {
  52     if (len > 7 && endsWith(s, len, "atech"))
  53       return len - 5;
  54
  55     if (len > 6 &&
  56         (endsWith(s, len,"ětem") ||
  57         endsWith(s, len,"etem") ||
  58         endsWith(s, len,"atům")))
  59       return len - 4;
  60
  61     if (len > 5 &&
  62         (endsWith(s, len, "ech") ||
  63         endsWith(s, len, "ich") ||
  64         endsWith(s, len, "ích") ||
  65         endsWith(s, len, "ého") ||
  66         endsWith(s, len, "ěmi") ||
  67         endsWith(s, len, "emi") ||
  68         endsWith(s, len, "ému") ||
  69         endsWith(s, len, "ěte") ||
  70         endsWith(s, len, "ete") ||
  71         endsWith(s, len, "ěti") ||
  72         endsWith(s, len, "eti") ||
  73         endsWith(s, len, "ího") ||
  74         endsWith(s, len, "iho") ||
  75         endsWith(s, len, "ími") ||
  76         endsWith(s, len, "ímu") ||
  77         endsWith(s, len, "imu") ||
  78         endsWith(s, len, "ách") ||
  79         endsWith(s, len, "ata") ||
  80         endsWith(s, len, "aty") ||
  81         endsWith(s, len, "ých") ||
  82         endsWith(s, len, "ama") ||
  83         endsWith(s, len, "ami") ||
  84         endsWith(s, len, "ové") ||
  85         endsWith(s, len, "ovi") ||
  86         endsWith(s, len, "ými")))
  87       return len - 3;
  88
  89     if (len > 4 &&
  90         (endsWith(s, len, "em") ||
  91         endsWith(s, len, "es") ||
  92         endsWith(s, len, "ém") ||
  93         endsWith(s, len, "ím") ||
  94         endsWith(s, len, "ům") ||
  95         endsWith(s, len, "at") ||
  96         endsWith(s, len, "ám") ||
  97         endsWith(s, len, "os") ||
  98         endsWith(s, len, "us") ||
  99         endsWith(s, len, "ým") ||
 100         endsWith(s, len, "mi") ||
 101         endsWith(s, len, "ou")))
 102       return len - 2;
 103
 104     if (len > 3) {
 105       switch (s[len - 1]) {
 106         case 'a':
 107         case 'e':
 108         case 'i':
 109         case 'o':
 110         case 'u':
 111         case 'ů':
 112         case 'y':
 113         case 'á':
 114         case 'é':
 115         case 'í':
 116         case 'ý':
 117         case 'ě':
 118           return len - 1;
 119       }
 120     }
 121
 122     return len;
 123   }
 124
 125   private int removePossessives(char s[], int len) {
 126     if (len > 5 &&
 127         (endsWith(s, len, "ov") ||
 128         endsWith(s, len, "in") ||
 129         endsWith(s, len, "ův")))
 130       return len - 2;
 131
 132     return len;
 133   }
 134
 135   private int normalize(char s[], int len) {
 136     if (endsWith(s, len, "čt")) { // čt -> ck
 137       s[len - 2] = 'c';
 138       s[len - 1] = 'k';
 139       return len;
 140     }
 141
 142     if (endsWith(s, len, "št")) { // št -> sk
 143       s[len - 2] = 's';
 144       s[len - 1] = 'k';
 145       return len;
 146     }
 147
 148     switch(s[len - 1]) {
 149       case 'c': // [cč] -> k
 150       case 'č':
 151         s[len - 1] = 'k';
 152         return len;
 153       case 'z': // [zž] -> h
 154       case 'ž':
 155         s[len - 1] = 'h';
 156         return len;
 157     }
 158
 159     if (len > 1 && s[len - 2] == 'e') {
 160       s[len - 2] = s[len - 1]; // e* > *
 161       return len - 1;
 162     }
 163
 164     if (len > 2 && s[len - 2] == 'ů') {
 165       s[len - 2] = 'o'; // *ů* -> *o*
 166       return len;
 167     }
 168
 169     return len;
 170   }
 171 }