lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java

   1 package org.apache.lucene.analysis.hunspell;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.FileInputStream;
  21 import java.io.IOException;
  22 import java.io.InputStream;
  23 import java.text.ParseException;
  24 import java.util.ArrayList;
  25 import java.util.Arrays;
  26 import java.util.Collections;
  27 import java.util.List;
  28 import java.util.Scanner;
  29
  30 import org.apache.lucene.analysis.CharArraySet;
  31 import org.apache.lucene.util.CharacterUtils;
  32 import org.apache.lucene.util.Version;
  33
  34 /**
  35  * HunspellStemmer uses the affix rules declared in the HunspellDictionary to generate one or more stems for a word.  It
  36  * conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping.
  37  */
  38 public class HunspellStemmer {
  39
  40   private static final int RECURSION_CAP = 2;
  41
  42   private final HunspellDictionary dictionary;
  43   private final StringBuilder segment = new StringBuilder();
  44   private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_34);
  45
  46   /**
  47    * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
  48    *
  49    * @param dictionary HunspellDictionary that will be used to create the stems
  50    */
  51   public HunspellStemmer(HunspellDictionary dictionary) {
  52     this.dictionary = dictionary;
  53   }
  54
  55   /**
  56    * Find the stem(s) of the provided word
  57    *
  58    * @param word Word to find the stems for
  59    * @return List of stems for the word
  60    */
  61   public List<Stem> stem(String word) {
  62     return stem(word.toCharArray(), word.length());
  63   }
  64
  65   /**
  66    * Find the stem(s) of the provided word
  67    *
  68    * @param word Word to find the stems for
  69    * @return List of stems for the word
  70    */
  71   public List<Stem> stem(char word[], int length) {
  72     List<Stem> stems = new ArrayList<Stem>();
  73     if (dictionary.lookupWord(word, 0, length) != null) {
  74       stems.add(new Stem(word, length));
  75     }
  76     stems.addAll(stem(word, length, null, 0));
  77     return stems;
  78   }
  79
  80   /**
  81    * Find the unique stem(s) of the provided word
  82    *
  83    * @param word Word to find the stems for
  84    * @return List of stems for the word
  85    */
  86   public List<Stem> uniqueStems(char word[], int length) {
  87     List<Stem> stems = new ArrayList<Stem>();
  88     CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, dictionary.isIgnoreCase());
  89     if (dictionary.lookupWord(word, 0, length) != null) {
  90       stems.add(new Stem(word, length));
  91       terms.add(word);
  92     }
  93     List<Stem> otherStems = stem(word, length, null, 0);
  94     for (Stem s : otherStems) {
  95       if (!terms.contains(s.stem)) {
  96         stems.add(s);
  97         terms.add(s.stem);
  98       }
  99     }
 100     return stems;
 101   }
 102
 103   // ================================================= Helper Methods ================================================
 104
 105   /**
 106    * Generates a list of stems for the provided word
 107    *
 108    * @param word Word to generate the stems for
 109    * @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step
 110    * @param recursionDepth Level of recursion this stemming step is at
 111    * @return List of stems, pr an empty if no stems are found
 112    */
 113   private List<Stem> stem(char word[], int length, char[] flags, int recursionDepth) {
 114     List<Stem> stems = new ArrayList<Stem>();
 115
 116     for (int i = 0; i < length; i++) {
 117       List<HunspellAffix> suffixes = dictionary.lookupSuffix(word, i, length - i);
 118       if (suffixes == null) {
 119         continue;
 120       }
 121
 122       for (HunspellAffix suffix : suffixes) {
 123         if (hasCrossCheckedFlag(suffix.getFlag(), flags)) {
 124           int deAffixedLength = length - suffix.getAppend().length();
 125           // TODO: can we do this in-place?
 126           String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
 127
 128           List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
 129           for (Stem stem : stemList) {
 130             stem.addSuffix(suffix);
 131           }
 132
 133           stems.addAll(stemList);
 134         }
 135       }
 136     }
 137
 138     for (int i = length - 1; i >= 0; i--) {
 139       List<HunspellAffix> prefixes = dictionary.lookupPrefix(word, 0, i);
 140       if (prefixes == null) {
 141         continue;
 142       }
 143
 144       for (HunspellAffix prefix : prefixes) {
 145         if (hasCrossCheckedFlag(prefix.getFlag(), flags)) {
 146           int deAffixedStart = prefix.getAppend().length();
 147           int deAffixedLength = length - deAffixedStart;
 148
 149           String strippedWord = new StringBuilder().append(prefix.getStrip())
 150               .append(word, deAffixedStart, deAffixedLength)
 151               .toString();
 152
 153           List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
 154           for (Stem stem : stemList) {
 155             stem.addPrefix(prefix);
 156           }
 157
 158           stems.addAll(stemList);
 159         }
 160       }
 161     }
 162
 163     return stems;
 164   }
 165
 166   /**
 167    * Applies the affix rule to the given word, producing a list of stems if any are found
 168    *
 169    * @param strippedWord Word the affix has been removed and the strip added
 170    * @param affix HunspellAffix representing the affix rule itself
 171    * @param recursionDepth Level of recursion this stemming step is at
 172    * @return List of stems for the word, or an empty list if none are found
 173    */
 174   @SuppressWarnings("unchecked")
 175   public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
 176     if(dictionary.isIgnoreCase()) {
 177       for(int i=0;i<strippedWord.length;){
 178         i += Character.toChars(
 179               Character.toLowerCase(charUtils.codePointAt(strippedWord, i)), strippedWord, i);
 180       }
 181     }
 182     segment.setLength(0);
 183     segment.append(strippedWord, 0, length);
 184     if (!affix.checkCondition(segment)) {
 185       return Collections.EMPTY_LIST;
 186     }
 187
 188     List<Stem> stems = new ArrayList<Stem>();
 189
 190     List<HunspellWord> words = dictionary.lookupWord(strippedWord, 0, length);
 191     if (words != null) {
 192       for (HunspellWord hunspellWord : words) {
 193         if (hunspellWord.hasFlag(affix.getFlag())) {
 194           stems.add(new Stem(strippedWord, length));
 195         }
 196       }
 197     }
 198
 199     if (affix.isCrossProduct() && recursionDepth < RECURSION_CAP) {
 200       stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth));
 201     }
 202
 203     return stems;
 204   }
 205
 206   /**
 207    * Checks if the given flag cross checks with the given array of flags
 208    *
 209    * @param flag Flag to cross check with the array of flags
 210    * @param flags Array of flags to cross check against.  Can be {@code null}
 211    * @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise
 212    */
 213   private boolean hasCrossCheckedFlag(char flag, char[] flags) {
 214     return flags == null || Arrays.binarySearch(flags, flag) >= 0;
 215   }
 216
 217   /**
 218    * Stem represents all information known about a stem of a word.  This includes the stem, and the prefixes and suffixes
 219    * that were used to change the word into the stem.
 220    */
 221   public static class Stem {
 222
 223     private final List<HunspellAffix> prefixes = new ArrayList<HunspellAffix>();
 224     private final List<HunspellAffix> suffixes = new ArrayList<HunspellAffix>();
 225     private final char stem[];
 226     private final int stemLength;
 227
 228     /**
 229      * Creates a new Stem wrapping the given word stem
 230      *
 231      * @param stem Stem of a word
 232      */
 233     public Stem(char stem[], int stemLength) {
 234       this.stem = stem;
 235       this.stemLength = stemLength;
 236     }
 237
 238     /**
 239      * Adds a prefix to the list of prefixes used to generate this stem.  Because it is assumed that prefixes are added
 240      * depth first, the prefix is added to the front of the list
 241      *
 242      * @param prefix Prefix to add to the list of prefixes for this stem
 243      */
 244     public void addPrefix(HunspellAffix prefix) {
 245       prefixes.add(0, prefix);
 246     }
 247
 248     /**
 249      * Adds a suffix to the list of suffixes used to generate this stem.  Because it is assumed that suffixes are added
 250      * depth first, the suffix is added to the end of the list
 251      *
 252      * @param suffix Suffix to add to the list of suffixes for this stem
 253      */
 254     public void addSuffix(HunspellAffix suffix) {
 255       suffixes.add(suffix);
 256     }
 257
 258     /**
 259      * Returns the list of prefixes used to generate the stem
 260      *
 261      * @return List of prefixes used to generate the stem or an empty list if no prefixes were required
 262      */
 263     public List<HunspellAffix> getPrefixes() {
 264       return prefixes;
 265     }
 266
 267     /**
 268      * Returns the list of suffixes used to generate the stem
 269      *
 270      * @return List of suffixes used to generate the stem or an empty list if no suffixes were required
 271      */
 272     public List<HunspellAffix> getSuffixes() {
 273       return suffixes;
 274     }
 275
 276     /**
 277      * Returns the actual word stem itself
 278      *
 279      * @return Word stem itself
 280      */
 281     public char[] getStem() {
 282       return stem;
 283     }
 284
 285     /**
 286      * @return the stemLength
 287      */
 288     public int getStemLength() {
 289       return stemLength;
 290     }
 291
 292     public String getStemString() {
 293       return new String(stem, 0, stemLength);
 294     }
 295
 296   }
 297
 298
 299   // ================================================= Entry Point ===================================================
 300
 301   /**
 302    * HunspellStemmer entry point.  Accepts two arguments: location of affix file and location of dic file
 303    *
 304    * @param args Program arguments.  Should contain location of affix file and location of dic file
 305    * @throws IOException Can be thrown while reading from the files
 306    * @throws ParseException Can be thrown while parsing the files
 307    */
 308   public static void main(String[] args) throws IOException, ParseException {
 309     boolean ignoreCase = false;
 310     int offset = 0;
 311
 312     if (args.length < 2) {
 313       System.out.println("usage: HunspellStemmer [-i] <affix location> <dic location>");
 314       System.exit(1);
 315     }
 316
 317     if(args[offset].equals("-i")) {
 318       ignoreCase = true;
 319       System.out.println("Ignoring case. All stems will be returned lowercased");
 320       offset++;
 321     }
 322
 323     InputStream affixInputStream = new FileInputStream(args[offset++]);
 324     InputStream dicInputStream = new FileInputStream(args[offset++]);
 325
 326     HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_34, ignoreCase);
 327
 328     affixInputStream.close();
 329     dicInputStream.close();
 330
 331     HunspellStemmer stemmer = new HunspellStemmer(dictionary);
 332
 333     Scanner scanner = new Scanner(System.in);
 334
 335     System.out.print("> ");
 336     while (scanner.hasNextLine()) {
 337       String word = scanner.nextLine();
 338
 339       if ("exit".equals(word)) {
 340         break;
 341       }
 342
 343       printStemResults(word, stemmer.stem(word.toCharArray(), word.length()));
 344
 345       System.out.print("> ");
 346     }
 347   }
 348
 349   /**
 350    * Prints the results of the stemming of a word
 351    *
 352    * @param originalWord Word that has been stemmed
 353    * @param stems Stems of the word
 354    */
 355   private static void printStemResults(String originalWord, List<Stem> stems) {
 356     StringBuilder builder = new StringBuilder().append("stem(").append(originalWord).append(")").append("\n");
 357
 358     for (Stem stem : stems) {
 359       builder.append("- ").append(stem.getStem()).append(": ");
 360
 361       for (HunspellAffix prefix : stem.getPrefixes()) {
 362         builder.append(prefix.getAppend()).append("+");
 363
 364         if (hasText(prefix.getStrip())) {
 365           builder.append(prefix.getStrip()).append("-");
 366         }
 367       }
 368
 369       builder.append(stem.getStem());
 370
 371       for (HunspellAffix suffix : stem.getSuffixes()) {
 372         if (hasText(suffix.getStrip())) {
 373           builder.append("-").append(suffix.getStrip());
 374         }
 375
 376         builder.append("+").append(suffix.getAppend());
 377       }
 378       builder.append("\n");
 379     }
 380
 381     System.out.println(builder);
 382   }
 383
 384   /**
 385    * Simple utility to check if the given String has any text
 386    *
 387    * @param str String to check if it has any text
 388    * @return {@code true} if the String has text, {@code false} otherwise
 389    */
 390   private static boolean hasText(String str) {
 391     return str != null && str.length() > 0;
 392   }
 393 }