lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java

   1 package org.apache.lucene.analysis.pt;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 /*
  21  * This algorithm is updated based on code located at:
  22  * http://members.unine.ch/jacques.savoy/clef/
  23  *
  24  * Full copyright for that code follows:
  25  */
  26
  27 /*
  28  * Copyright (c) 2005, Jacques Savoy
  29  * All rights reserved.
  30  *
  31  * Redistribution and use in source and binary forms, with or without
  32  * modification, are permitted provided that the following conditions are met:
  33  *
  34  * Redistributions of source code must retain the above copyright notice, this
  35  * list of conditions and the following disclaimer. Redistributions in binary
  36  * form must reproduce the above copyright notice, this list of conditions and
  37  * the following disclaimer in the documentation and/or other materials
  38  * provided with the distribution. Neither the name of the author nor the names
  39  * of its contributors may be used to endorse or promote products derived from
  40  * this software without specific prior written permission.
  41  *
  42  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  43  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  45  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  46  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  47  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  48  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  49  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  50  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  51  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  52  * POSSIBILITY OF SUCH DAMAGE.
  53  */
  54
  55 import static org.apache.lucene.analysis.util.StemmerUtil.*;
  56
  57 /**
  58  * Light Stemmer for Portuguese
  59  * <p>
  60  * This stemmer implements the "UniNE" algorithm in:
  61  * <i>Light Stemming Approaches for the French, Portuguese, German and Hungarian Languages</i>
  62  * Jacques Savoy
  63  */
  64 public class PortugueseLightStemmer {
  65
  66   public int stem(char s[], int len) {
  67     if (len < 4)
  68       return len;
  69
  70     len = removeSuffix(s, len);
  71
  72     if (len > 3 && s[len-1] == 'a')
  73       len = normFeminine(s, len);
  74
  75     if (len > 4)
  76       switch(s[len-1]) {
  77         case 'e':
  78         case 'a':
  79         case 'o': len--; break;
  80       }
  81
  82     for (int i = 0; i < len; i++)
  83       switch(s[i]) {
  84         case 'à':
  85         case 'á':
  86         case 'â':
  87         case 'ä':
  88         case 'ã': s[i] = 'a'; break;
  89         case 'ò':
  90         case 'ó':
  91         case 'ô':
  92         case 'ö':
  93         case 'õ': s[i] = 'o'; break;
  94         case 'è':
  95         case 'é':
  96         case 'ê':
  97         case 'ë': s[i] = 'e'; break;
  98         case 'ù':
  99         case 'ú':
 100         case 'û':
 101         case 'ü': s[i] = 'u'; break;
 102         case 'ì':
 103         case 'í':
 104         case 'î':
 105         case 'ï': s[i] = 'i'; break;
 106         case 'ç': s[i] = 'c'; break;
 107       }
 108
 109     return len;
 110   }
 111
 112   private int removeSuffix(char s[], int len) {
 113     if (len > 4 && endsWith(s, len, "es"))
 114       switch(s[len-3]) {
 115         case 'r':
 116         case 's':
 117         case 'l':
 118         case 'z': return len - 2;
 119       }
 120
 121     if (len > 3 && endsWith(s, len, "ns")) {
 122       s[len - 2] = 'm';
 123       return len - 1;
 124     }
 125
 126     if (len > 4 && (endsWith(s, len, "eis") || endsWith(s, len, "éis"))) {
 127       s[len - 3] = 'e';
 128       s[len - 2] = 'l';
 129       return len - 1;
 130     }
 131
 132     if (len > 4 && endsWith(s, len, "ais")) {
 133       s[len - 2] = 'l';
 134       return len - 1;
 135     }
 136
 137     if (len > 4 && endsWith(s, len, "óis")) {
 138       s[len - 3] = 'o';
 139       s[len - 2] = 'l';
 140       return len - 1;
 141     }
 142
 143     if (len > 4 && endsWith(s, len, "is")) {
 144       s[len - 1] = 'l';
 145       return len;
 146     }
 147
 148     if (len > 3 &&
 149         (endsWith(s, len, "ões") ||
 150          endsWith(s, len, "ães"))) {
 151       len--;
 152       s[len - 2] = 'ã';
 153       s[len - 1] = 'o';
 154       return len;
 155     }
 156
 157     if (len > 6 && endsWith(s, len, "mente"))
 158       return len - 5;
 159
 160     if (len > 3 && s[len-1] == 's')
 161       return len - 1;
 162     return len;
 163   }
 164
 165   private int normFeminine(char s[], int len) {
 166     if (len > 7 &&
 167         (endsWith(s, len, "inha") ||
 168          endsWith(s, len, "iaca") ||
 169          endsWith(s, len, "eira"))) {
 170       s[len - 1] = 'o';
 171       return len;
 172     }
 173
 174     if (len > 6) {
 175       if (endsWith(s, len, "osa") ||
 176           endsWith(s, len, "ica") ||
 177           endsWith(s, len, "ida") ||
 178           endsWith(s, len, "ada") ||
 179           endsWith(s, len, "iva") ||
 180           endsWith(s, len, "ama")) {
 181         s[len - 1] = 'o';
 182         return len;
 183       }
 184
 185       if (endsWith(s, len, "ona")) {
 186         s[len - 3] = 'ã';
 187         s[len - 2] = 'o';
 188         return len - 1;
 189       }
 190
 191       if (endsWith(s, len, "ora"))
 192         return len - 1;
 193
 194       if (endsWith(s, len, "esa")) {
 195         s[len - 3] = 'ê';
 196         return len - 1;
 197       }
 198
 199       if (endsWith(s, len, "na")) {
 200         s[len - 1] = 'o';
 201         return len;
 202       }
 203     }
 204     return len;
 205   }
 206 }