lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemmer.java

   1 package org.apache.lucene.analysis.pt;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.util.Map;
  21
  22 /**
  23  * Portuguese stemmer implementing the RSLP (Removedor de Sufixos da Lingua Portuguesa)
  24  * algorithm. This is sometimes also referred to as the Orengo stemmer.
  25  *
  26  * @see RSLPStemmerBase
  27  */
  28 public class PortugueseStemmer extends RSLPStemmerBase {
  29   private static final Step plural, feminine, adverb, augmentative, noun, verb, vowel;
  30
  31   static {
  32     Map<String,Step> steps = parse(PortugueseStemmer.class, "portuguese.rslp");
  33     plural = steps.get("Plural");
  34     feminine = steps.get("Feminine");
  35     adverb = steps.get("Adverb");
  36     augmentative = steps.get("Augmentative");
  37     noun = steps.get("Noun");
  38     verb = steps.get("Verb");
  39     vowel = steps.get("Vowel");
  40   }
  41
  42   /**
  43    * @param s buffer, oversized to at least <code>len+1</code>
  44    * @param len initial valid length of buffer
  45    * @return new valid length, stemmed
  46    */
  47   public int stem(char s[], int len) {
  48     assert s.length >= len + 1 : "this stemmer requires an oversized array of at least 1";
  49
  50     len = plural.apply(s, len);
  51     len = adverb.apply(s, len);
  52     len = feminine.apply(s, len);
  53     len = augmentative.apply(s, len);
  54
  55     int oldlen = len;
  56     len = noun.apply(s, len);
  57
  58     if (len == oldlen) { /* suffix not removed */
  59       oldlen = len;
  60
  61       len = verb.apply(s, len);
  62
  63       if (len == oldlen) { /* suffix not removed */
  64         len = vowel.apply(s, len);
  65       }
  66     }
  67
  68     // rslp accent removal
  69     for (int i = 0; i < len; i++) {
  70       switch(s[i]) {
  71         case 'à':
  72         case 'á':
  73         case 'â':
  74         case 'ã':
  75         case 'ä':
  76         case 'å': s[i] = 'a'; break;
  77         case 'ç': s[i] = 'c'; break;
  78         case 'è':
  79         case 'é':
  80         case 'ê':
  81         case 'ë': s[i] = 'e'; break;
  82         case 'ì':
  83         case 'í':
  84         case 'î':
  85         case 'ï': s[i] = 'i'; break;
  86         case 'ñ': s[i] = 'n'; break;
  87         case 'ò':
  88         case 'ó':
  89         case 'ô':
  90         case 'õ':
  91         case 'ö': s[i] = 'o'; break;
  92         case 'ù':
  93         case 'ú':
  94         case 'û':
  95         case 'ü': s[i] = 'u'; break;
  96         case 'ý':
  97         case 'ÿ': s[i] = 'y'; break;
  98       }
  99     }
 100     return len;
 101   }
 102 }