1 package org.apache.lucene.analysis.hunspell;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.util.List;
23 import org.apache.lucene.analysis.TokenFilter;
24 import org.apache.lucene.analysis.TokenStream;
25 import org.apache.lucene.analysis.hunspell.HunspellStemmer.Stem;
26 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
27 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
30 * TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple
31 * stems, this filter can emit multiple tokens for each consumed token
33 public final class HunspellStemFilter extends TokenFilter {
35 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
36 private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
37 private final HunspellStemmer stemmer;
39 private List<Stem> buffer;
40 private State savedState;
42 private final boolean dedup;
/**
 * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
 * HunspellDictionary. Delegates to the three-argument constructor with {@code dedup} set to {@code true}.
 *
 * @param input TokenStream whose tokens will be stemmed
 * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
 */
public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) {
  this(input, dictionary, true);
}
/**
 * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
 * HunspellDictionary.
 *
 * @param input TokenStream whose tokens will be stemmed
 * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
 * @param dedup true if only unique terms should be output.
 */
public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) {
  super(input);
  // dedup is a final field, so it must be assigned exactly once here.
  this.dedup = dedup;
  this.stemmer = new HunspellStemmer(dictionary);
}
73 public boolean incrementToken() throws IOException {
74 if (buffer != null && !buffer.isEmpty()) {
75 Stem nextStem = buffer.remove(0);
76 restoreState(savedState);
77 posIncAtt.setPositionIncrement(0);
78 termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength());
79 termAtt.setLength(nextStem.getStemLength());
83 if (!input.incrementToken()) {
87 buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length());
89 if (buffer.isEmpty()) { // we do not know this word, return it unchanged
93 Stem stem = buffer.remove(0);
94 termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength());
95 termAtt.setLength(stem.getStemLength());
97 if (!buffer.isEmpty()) {
98 savedState = captureState();
108 public void reset() throws IOException {