lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java

   1 package org.apache.lucene.analysis.bg;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import static org.apache.lucene.analysis.util.StemmerUtil.*;
  21
  22 /**
  23  * Light Stemmer for Bulgarian.
  24  * <p>
  25  * Implements the algorithm described in:
  26  * <i>
  27  * Searching Strategies for the Bulgarian Language
  28  * </i>
  29  * http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
  30  */
  31 public class BulgarianStemmer {
  32
  33   /**
  34    * Stem an input buffer of Bulgarian text.
  35    *
  36    * @param s input buffer
  37    * @param len length of input buffer
  38    * @return length of input buffer after normalization
  39    */
  40   public int stem(final char s[], int len) {
  41     if (len < 4) // do not stem
  42       return len;
  43
  44     if (len > 5 && endsWith(s, len, "ища"))
  45       return len - 3;
  46
  47     len = removeArticle(s, len);
  48     len = removePlural(s, len);
  49
  50     if (len > 3) {
  51       if (endsWith(s, len, "я"))
  52         len--;
  53       if (endsWith(s, len, "а") ||
  54           endsWith(s, len, "о") ||
  55           endsWith(s, len, "е"))
  56         len--;
  57     }
  58
  59     // the rule to rewrite ен -> н is duplicated in the paper.
  60     // in the perl implementation referenced by the paper, this is fixed.
  61     // (it is fixed here as well)
  62     if (len > 4 && endsWith(s, len, "ен")) {
  63       s[len - 2] = 'н'; // replace with н
  64       len--;
  65     }
  66
  67     if (len > 5 && s[len - 2] == 'ъ') {
  68       s[len - 2] = s[len - 1]; // replace ъN with N
  69       len--;
  70     }
  71
  72     return len;
  73   }
  74
  75   /**
  76    * Mainly remove the definite article
  77    * @param s input buffer
  78    * @param len length of input buffer
  79    * @return new stemmed length
  80    */
  81   private int removeArticle(final char s[], final int len) {
  82     if (len > 6 && endsWith(s, len, "ият"))
  83       return len - 3;
  84
  85     if (len > 5) {
  86       if (endsWith(s, len, "ът") ||
  87           endsWith(s, len, "то") ||
  88           endsWith(s, len, "те") ||
  89           endsWith(s, len, "та") ||
  90           endsWith(s, len, "ия"))
  91         return len - 2;
  92     }
  93
  94     if (len > 4 && endsWith(s, len, "ят"))
  95       return len - 2;
  96
  97     return len;
  98   }
  99
 100   private int removePlural(final char s[], final int len) {
 101     if (len > 6) {
 102       if (endsWith(s, len, "овци"))
 103         return len - 3; // replace with о
 104       if (endsWith(s, len, "ове"))
 105         return len - 3;
 106       if (endsWith(s, len, "еве")) {
 107         s[len - 3] = 'й'; // replace with й
 108         return len - 2;
 109       }
 110     }
 111
 112     if (len > 5) {
 113       if (endsWith(s, len, "ища"))
 114         return len - 3;
 115       if (endsWith(s, len, "та"))
 116         return len - 2;
 117       if (endsWith(s, len, "ци")) {
 118         s[len - 2] = 'к'; // replace with к
 119         return len - 1;
 120       }
 121       if (endsWith(s, len, "зи")) {
 122         s[len - 2] = 'г'; // replace with г
 123         return len - 1;
 124       }
 125
 126       if (s[len - 3] == 'е' && s[len - 1] == 'и') {
 127         s[len - 3] = 'я'; // replace е with я, remove и
 128         return len - 1;
 129       }
 130     }
 131
 132     if (len > 4) {
 133       if (endsWith(s, len, "си")) {
 134         s[len - 2] = 'х'; // replace with х
 135         return len - 1;
 136       }
 137       if (endsWith(s, len, "и"))
 138         return len - 1;
 139     }
 140
 141     return len;
 142   }
 143 }