lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/RSLPStemmerBase.java

   1 package org.apache.lucene.analysis.pt;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.InputStream;
  22 import java.io.InputStreamReader;
  23 import java.io.LineNumberReader;
  24 import java.util.ArrayList;
  25 import java.util.Arrays;
  26 import java.util.HashMap;
  27 import java.util.List;
  28 import java.util.Map;
  29 import java.util.regex.Matcher;
  30 import java.util.regex.Pattern;
  31
  32 import org.apache.lucene.analysis.CharArraySet;
  33 import org.apache.lucene.util.Version;
  34
  35 import static org.apache.lucene.analysis.util.StemmerUtil.*;
  36
  37 /**
  38  * Base class for stemmers that use a set of RSLP-like stemming steps.
  39  * <p>
  40  * RSLP (Removedor de Sufixos da Lingua Portuguesa) is an algorithm designed
  41  * originally for stemming the Portuguese language, described in the paper
  42  * <i>A Stemming Algorithm for the Portuguese Language</i>, Orengo et. al.
  43  * <p>
  44  * Since this time a plural-only modification (RSLP-S) as well as a modification
  45  * for the Galician language have been implemented. This class parses a configuration
  46  * file that describes {@link Step}s, where each Step contains a set of {@link Rule}s.
  47  * <p>
  48  * The general rule format is:
  49  * <blockquote>{ "suffix", N, "replacement", { "exception1", "exception2", ...}}</blockquote>
  50  * where:
  51  * <ul>
  52  *   <li><code>suffix</code> is the suffix to be removed (such as "inho").
  53  *   <li><code>N</code> is the min stem size, where stem is defined as the candidate stem
  54  *       after removing the suffix (but before appending the replacement!)
  55  *   <li><code>replacement</code> is an optimal string to append after removing the suffix.
  56  *       This can be the empty string.
  57  *   <li><code>exceptions</code> is an optional list of exceptions, patterns that should
  58  *       not be stemmed. These patterns can be specified as whole word or suffix (ends-with)
  59  *       patterns, depending upon the exceptions format flag in the step header.
  60  * </ul>
  61  * <p>
  62  * A step is an ordered list of rules, with a structure in this format:
  63  * <blockquote>{ "name", N, B, { "cond1", "cond2", ... }
  64  *               ... rules ... };
  65  * </blockquote>
  66  * where:
  67  * <ul>
  68  *   <li><code>name</code> is a name for the step (such as "Plural").
  69  *   <li><code>N</code> is the min word size. Words that are less than this length bypass
  70  *       the step completely, as an optimization. Note: N can be zero, in this case this
  71  *       implementation will automatically calculate the appropriate value from the underlying
  72  *       rules.
  73  *   <li><code>B</code> is a "boolean" flag specifying how exceptions in the rules are matched.
  74  *       A value of 1 indicates whole-word pattern matching, a value of 0 indicates that
  75  *       exceptions are actually suffixes and should be matched with ends-with.
  76  *   <li><code>conds</code> are an optional list of conditions to enter the step at all. If
  77  *       the list is non-empty, then a word must end with one of these conditions or it will
  78  *       bypass the step completely as an optimization.
  79  * </ul>
  80  * <p>
  81  * @see <a href="http://www.inf.ufrgs.br/~viviane/rslp/index.htm">RSLP description</a>
  82  * @lucene.internal
  83  */
  84 public abstract class RSLPStemmerBase {
  85
  86   /**
  87    * A basic rule, with no exceptions.
  88    */
  89   protected static class Rule {
  90     protected final char suffix[];
  91     protected final char replacement[];
  92     protected final int min;
  93
  94     /**
  95      * Create a rule.
  96      * @param suffix suffix to remove
  97      * @param min minimum stem length
  98      * @param replacement replacement string
  99      */
 100     public Rule(String suffix, int min, String replacement) {
 101       this.suffix = suffix.toCharArray();
 102       this.replacement = replacement.toCharArray();
 103       this.min = min;
 104     }
 105
 106     /**
 107      * @return true if the word matches this rule.
 108      */
 109     public boolean matches(char s[], int len) {
 110       return (len - suffix.length >= min && endsWith(s, len, suffix));
 111     }
 112
 113     /**
 114      * @return new valid length of the string after firing this rule.
 115      */
 116     public int replace(char s[], int len) {
 117       if (replacement.length > 0) {
 118         System.arraycopy(replacement, 0, s, len - suffix.length, replacement.length);
 119       }
 120       return len - suffix.length + replacement.length;
 121     }
 122   }
 123
 124   /**
 125    * A rule with a set of whole-word exceptions.
 126    */
 127   protected static class RuleWithSetExceptions extends Rule {
 128     protected final CharArraySet exceptions;
 129
 130     public RuleWithSetExceptions(String suffix, int min, String replacement,
 131         String[] exceptions) {
 132       super(suffix, min, replacement);
 133       for (int i = 0; i < exceptions.length; i++) {
 134         if (!exceptions[i].endsWith(suffix))
 135           System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
 136       }
 137       this.exceptions = new CharArraySet(Version.LUCENE_31,
 138            Arrays.asList(exceptions), false);
 139     }
 140
 141     @Override
 142     public boolean matches(char s[], int len) {
 143       return super.matches(s, len) && !exceptions.contains(s, 0, len);
 144     }
 145   }
 146
 147   /**
 148    * A rule with a set of exceptional suffixes.
 149    */
 150   protected static class RuleWithSuffixExceptions extends Rule {
 151     // TODO: use a more efficient datastructure: automaton?
 152     protected final char[][] exceptions;
 153
 154     public RuleWithSuffixExceptions(String suffix, int min, String replacement,
 155         String[] exceptions) {
 156       super(suffix, min, replacement);
 157       for (int i = 0; i < exceptions.length; i++) {
 158         if (!exceptions[i].endsWith(suffix))
 159           System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
 160       }
 161       this.exceptions = new char[exceptions.length][];
 162       for (int i = 0; i < exceptions.length; i++)
 163         this.exceptions[i] = exceptions[i].toCharArray();
 164     }
 165
 166     @Override
 167     public boolean matches(char s[], int len) {
 168       if (!super.matches(s, len))
 169         return false;
 170
 171       for (int i = 0; i < exceptions.length; i++)
 172         if (endsWith(s, len, exceptions[i]))
 173           return false;
 174
 175       return true;
 176     }
 177   }
 178
 179   /**
 180    * A step containing a list of rules.
 181    */
 182   protected static class Step {
 183     protected final String name;
 184     protected final Rule rules[];
 185     protected final int min;
 186     protected final char[][] suffixes;
 187
 188     /**
 189      * Create a new step
 190      * @param name Step's name.
 191      * @param rules an ordered list of rules.
 192      * @param min minimum word size. if this is 0 it is automatically calculated.
 193      * @param suffixes optional list of conditional suffixes. may be null.
 194      */
 195     public Step(String name, Rule rules[], int min, String suffixes[]) {
 196       this.name = name;
 197       this.rules = rules;
 198       if (min == 0) {
 199         min = Integer.MAX_VALUE;
 200         for (Rule r : rules)
 201           min = Math.min(min, r.min + r.suffix.length);
 202       }
 203       this.min = min;
 204
 205       if (suffixes == null || suffixes.length == 0) {
 206         this.suffixes = null;
 207       } else {
 208         this.suffixes = new char[suffixes.length][];
 209         for (int i = 0; i < suffixes.length; i++)
 210           this.suffixes[i] = suffixes[i].toCharArray();
 211       }
 212     }
 213
 214     /**
 215      * @return new valid length of the string after applying the entire step.
 216      */
 217     public int apply(char s[], int len) {
 218       if (len < min)
 219         return len;
 220
 221       if (suffixes != null) {
 222         boolean found = false;
 223
 224         for (int i = 0; i < suffixes.length; i++)
 225           if (endsWith(s, len, suffixes[i])) {
 226             found = true;
 227             break;
 228           }
 229
 230         if (!found) return len;
 231       }
 232
 233       for (int i = 0; i < rules.length; i++) {
 234         if (rules[i].matches(s, len))
 235           return rules[i].replace(s, len);
 236       }
 237
 238       return len;
 239     }
 240   }
 241
 242   /**
 243    * Parse a resource file into an RSLP stemmer description.
 244    * @return a Map containing the named Steps in this description.
 245    */
 246   protected static Map<String,Step> parse(Class<? extends RSLPStemmerBase> clazz, String resource) {
 247     // TODO: this parser is ugly, but works. use a jflex grammar instead.
 248     try {
 249       InputStream is = clazz.getResourceAsStream(resource);
 250       LineNumberReader r = new LineNumberReader(new InputStreamReader(is, "UTF-8"));
 251       Map<String,Step> steps = new HashMap<String,Step>();
 252       String step;
 253       while ((step = readLine(r)) != null) {
 254         Step s = parseStep(r, step);
 255         steps.put(s.name, s);
 256       }
 257       r.close();
 258       return steps;
 259     } catch (IOException e) {
 260       throw new RuntimeException(e);
 261     }
 262   }
 263
 264   private static final Pattern headerPattern =
 265     Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*(0|1),\\s*\\{(.*)\\},\\s*$");
 266   private static final Pattern stripPattern =
 267     Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+)\\s*\\}\\s*(,|(\\}\\s*;))$");
 268   private static final Pattern repPattern =
 269     Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\"\\}\\s*(,|(\\}\\s*;))$");
 270   private static final Pattern excPattern =
 271     Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\",\\s*\\{(.*)\\}\\s*\\}\\s*(,|(\\}\\s*;))$");
 272
 273   private static Step parseStep(LineNumberReader r, String header) throws IOException {
 274     Matcher matcher = headerPattern.matcher(header);
 275     if (!matcher.find()) {
 276       throw new RuntimeException("Illegal Step header specified at line " + r.getLineNumber());
 277     }
 278     assert matcher.groupCount() == 4;
 279     String name = matcher.group(1);
 280     int min = Integer.parseInt(matcher.group(2));
 281     int type = Integer.parseInt(matcher.group(3));
 282     String suffixes[] = parseList(matcher.group(4));
 283     Rule rules[] = parseRules(r, type);
 284     return new Step(name, rules, min, suffixes);
 285   }
 286
 287   private static Rule[] parseRules(LineNumberReader r, int type) throws IOException {
 288     List<Rule> rules = new ArrayList<Rule>();
 289     String line;
 290     while ((line = readLine(r)) != null) {
 291       Matcher matcher = stripPattern.matcher(line);
 292       if (matcher.matches()) {
 293         rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), ""));
 294       } else {
 295         matcher = repPattern.matcher(line);
 296         if (matcher.matches()) {
 297           rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), matcher.group(3)));
 298         } else {
 299           matcher = excPattern.matcher(line);
 300           if (matcher.matches()) {
 301             if (type == 0) {
 302               rules.add(new RuleWithSuffixExceptions(matcher.group(1),
 303                         Integer.parseInt(matcher.group(2)),
 304                         matcher.group(3),
 305                         parseList(matcher.group(4))));
 306             } else {
 307               rules.add(new RuleWithSetExceptions(matcher.group(1),
 308                         Integer.parseInt(matcher.group(2)),
 309                         matcher.group(3),
 310                         parseList(matcher.group(4))));
 311             }
 312           } else {
 313             throw new RuntimeException("Illegal Step rule specified at line " + r.getLineNumber());
 314           }
 315         }
 316       }
 317       if (line.endsWith(";"))
 318         return rules.toArray(new Rule[rules.size()]);
 319     }
 320     return null;
 321   }
 322
 323   private static String[] parseList(String s) {
 324     if (s.length() == 0)
 325       return null;
 326     String list[] = s.split(",");
 327     for (int i = 0; i < list.length; i++)
 328       list[i] = parseString(list[i].trim());
 329     return list;
 330   }
 331
 332   private static String parseString(String s) {
 333     return s.substring(1, s.length()-1);
 334   }
 335
 336   private static String readLine(LineNumberReader r) throws IOException {
 337     String line = null;
 338     while ((line = r.readLine()) != null) {
 339       line = line.trim();
 340       if (line.length() > 0 && line.charAt(0) != '#')
 341         return line;
 342     }
 343     return line;
 344   }
 345 }