1 package org.apache.lucene.analysis.hunspell;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import java.io.FileInputStream;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.text.ParseException;
24 import java.util.ArrayList;
25 import java.util.Arrays;
26 import java.util.Collections;
27 import java.util.List;
28 import java.util.Scanner;
30 import org.apache.lucene.analysis.CharArraySet;
31 import org.apache.lucene.util.CharacterUtils;
32 import org.apache.lucene.util.Version;
35 * HunspellStemmer uses the affix rules declared in the HunspellDictionary to generate one or more stems for a word. It
36 * conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping.
38 public class HunspellStemmer {
40 private static final int RECURSION_CAP = 2;
42 private final HunspellDictionary dictionary;
43 private final StringBuilder segment = new StringBuilder();
44 private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_34);
47 * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
49 * @param dictionary HunspellDictionary that will be used to create the stems
51 public HunspellStemmer(HunspellDictionary dictionary) {
52 this.dictionary = dictionary;
56 * Find the stem(s) of the provided word
58 * @param word Word to find the stems for
59 * @return List of stems for the word
61 public List<Stem> stem(String word) {
62 return stem(word.toCharArray(), word.length());
66 * Find the stem(s) of the provided word
68 * @param word Word to find the stems for
69 * @return List of stems for the word
71 public List<Stem> stem(char word[], int length) {
72 List<Stem> stems = new ArrayList<Stem>();
73 if (dictionary.lookupWord(word, 0, length) != null) {
74 stems.add(new Stem(word, length));
76 stems.addAll(stem(word, length, null, 0));
81 * Find the unique stem(s) of the provided word
83 * @param word Word to find the stems for
84 * @return List of stems for the word
86 public List<Stem> uniqueStems(char word[], int length) {
87 List<Stem> stems = new ArrayList<Stem>();
88 CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, dictionary.isIgnoreCase());
89 if (dictionary.lookupWord(word, 0, length) != null) {
90 stems.add(new Stem(word, length));
93 List<Stem> otherStems = stem(word, length, null, 0);
94 for (Stem s : otherStems) {
95 if (!terms.contains(s.stem)) {
103 // ================================================= Helper Methods ================================================
106 * Generates a list of stems for the provided word
108 * @param word Word to generate the stems for
109 * @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step
110 * @param recursionDepth Level of recursion this stemming step is at
111 * @return List of stems, pr an empty if no stems are found
113 private List<Stem> stem(char word[], int length, char[] flags, int recursionDepth) {
114 List<Stem> stems = new ArrayList<Stem>();
116 for (int i = 0; i < length; i++) {
117 List<HunspellAffix> suffixes = dictionary.lookupSuffix(word, i, length - i);
118 if (suffixes == null) {
122 for (HunspellAffix suffix : suffixes) {
123 if (hasCrossCheckedFlag(suffix.getFlag(), flags)) {
124 int deAffixedLength = length - suffix.getAppend().length();
125 // TODO: can we do this in-place?
126 String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
128 List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
129 for (Stem stem : stemList) {
130 stem.addSuffix(suffix);
133 stems.addAll(stemList);
138 for (int i = length - 1; i >= 0; i--) {
139 List<HunspellAffix> prefixes = dictionary.lookupPrefix(word, 0, i);
140 if (prefixes == null) {
144 for (HunspellAffix prefix : prefixes) {
145 if (hasCrossCheckedFlag(prefix.getFlag(), flags)) {
146 int deAffixedStart = prefix.getAppend().length();
147 int deAffixedLength = length - deAffixedStart;
149 String strippedWord = new StringBuilder().append(prefix.getStrip())
150 .append(word, deAffixedStart, deAffixedLength)
153 List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
154 for (Stem stem : stemList) {
155 stem.addPrefix(prefix);
158 stems.addAll(stemList);
167 * Applies the affix rule to the given word, producing a list of stems if any are found
169 * @param strippedWord Word the affix has been removed and the strip added
170 * @param affix HunspellAffix representing the affix rule itself
171 * @param recursionDepth Level of recursion this stemming step is at
172 * @return List of stems for the word, or an empty list if none are found
174 @SuppressWarnings("unchecked")
175 public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
176 if(dictionary.isIgnoreCase()) {
177 for(int i=0;i<strippedWord.length;){
178 i += Character.toChars(
179 Character.toLowerCase(charUtils.codePointAt(strippedWord, i)), strippedWord, i);
182 segment.setLength(0);
183 segment.append(strippedWord, 0, length);
184 if (!affix.checkCondition(segment)) {
185 return Collections.EMPTY_LIST;
188 List<Stem> stems = new ArrayList<Stem>();
190 List<HunspellWord> words = dictionary.lookupWord(strippedWord, 0, length);
192 for (HunspellWord hunspellWord : words) {
193 if (hunspellWord.hasFlag(affix.getFlag())) {
194 stems.add(new Stem(strippedWord, length));
199 if (affix.isCrossProduct() && recursionDepth < RECURSION_CAP) {
200 stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth));
207 * Checks if the given flag cross checks with the given array of flags
209 * @param flag Flag to cross check with the array of flags
210 * @param flags Array of flags to cross check against. Can be {@code null}
211 * @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise
213 private boolean hasCrossCheckedFlag(char flag, char[] flags) {
214 return flags == null || Arrays.binarySearch(flags, flag) >= 0;
218 * Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes
219 * that were used to change the word into the stem.
221 public static class Stem {
223 private final List<HunspellAffix> prefixes = new ArrayList<HunspellAffix>();
224 private final List<HunspellAffix> suffixes = new ArrayList<HunspellAffix>();
225 private final char stem[];
226 private final int stemLength;
229 * Creates a new Stem wrapping the given word stem
231 * @param stem Stem of a word
233 public Stem(char stem[], int stemLength) {
235 this.stemLength = stemLength;
239 * Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added
240 * depth first, the prefix is added to the front of the list
242 * @param prefix Prefix to add to the list of prefixes for this stem
244 public void addPrefix(HunspellAffix prefix) {
245 prefixes.add(0, prefix);
249 * Adds a suffix to the list of suffixes used to generate this stem. Because it is assumed that suffixes are added
250 * depth first, the suffix is added to the end of the list
252 * @param suffix Suffix to add to the list of suffixes for this stem
254 public void addSuffix(HunspellAffix suffix) {
255 suffixes.add(suffix);
259 * Returns the list of prefixes used to generate the stem
261 * @return List of prefixes used to generate the stem or an empty list if no prefixes were required
263 public List<HunspellAffix> getPrefixes() {
268 * Returns the list of suffixes used to generate the stem
270 * @return List of suffixes used to generate the stem or an empty list if no suffixes were required
272 public List<HunspellAffix> getSuffixes() {
277 * Returns the actual word stem itself
279 * @return Word stem itself
281 public char[] getStem() {
286 * @return the stemLength
288 public int getStemLength() {
292 public String getStemString() {
293 return new String(stem, 0, stemLength);
299 // ================================================= Entry Point ===================================================
302 * HunspellStemmer entry point. Accepts two arguments: location of affix file and location of dic file
304 * @param args Program arguments. Should contain location of affix file and location of dic file
305 * @throws IOException Can be thrown while reading from the files
306 * @throws ParseException Can be thrown while parsing the files
308 public static void main(String[] args) throws IOException, ParseException {
309 boolean ignoreCase = false;
312 if (args.length < 2) {
313 System.out.println("usage: HunspellStemmer [-i] <affix location> <dic location>");
317 if(args[offset].equals("-i")) {
319 System.out.println("Ignoring case. All stems will be returned lowercased");
323 InputStream affixInputStream = new FileInputStream(args[offset++]);
324 InputStream dicInputStream = new FileInputStream(args[offset++]);
326 HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_34, ignoreCase);
328 affixInputStream.close();
329 dicInputStream.close();
331 HunspellStemmer stemmer = new HunspellStemmer(dictionary);
333 Scanner scanner = new Scanner(System.in);
335 System.out.print("> ");
336 while (scanner.hasNextLine()) {
337 String word = scanner.nextLine();
339 if ("exit".equals(word)) {
343 printStemResults(word, stemmer.stem(word.toCharArray(), word.length()));
345 System.out.print("> ");
350 * Prints the results of the stemming of a word
352 * @param originalWord Word that has been stemmed
353 * @param stems Stems of the word
355 private static void printStemResults(String originalWord, List<Stem> stems) {
356 StringBuilder builder = new StringBuilder().append("stem(").append(originalWord).append(")").append("\n");
358 for (Stem stem : stems) {
359 builder.append("- ").append(stem.getStem()).append(": ");
361 for (HunspellAffix prefix : stem.getPrefixes()) {
362 builder.append(prefix.getAppend()).append("+");
364 if (hasText(prefix.getStrip())) {
365 builder.append(prefix.getStrip()).append("-");
369 builder.append(stem.getStem());
371 for (HunspellAffix suffix : stem.getSuffixes()) {
372 if (hasText(suffix.getStrip())) {
373 builder.append("-").append(suffix.getStrip());
376 builder.append("+").append(suffix.getAppend());
378 builder.append("\n");
381 System.out.println(builder);
385 * Simple utility to check if the given String has any text
387 * @param str String to check if it has any text
388 * @return {@code true} if the String has text, {@code false} otherwise
390 private static boolean hasText(String str) {
391 return str != null && str.length() > 0;