lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java

   1 package org.apache.lucene.analysis.hunspell;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.util.List;
  22
  23 import org.apache.lucene.analysis.TokenFilter;
  24 import org.apache.lucene.analysis.TokenStream;
  25 import org.apache.lucene.analysis.hunspell.HunspellStemmer.Stem;
  26 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  27 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  28
  29 /**
  30  * TokenFilter that uses hunspell affix rules and words to stem tokens.  Since hunspell supports a word having multiple
  31  * stems, this filter can emit multiple tokens for each consumed token
  32  */
  33 public final class HunspellStemFilter extends TokenFilter {
  34
  35   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  36   private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  37   private final HunspellStemmer stemmer;
  38
  39   private List<Stem> buffer;
  40   private State savedState;
  41
  42   private final boolean dedup;
  43
  44   /**
  45    * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
  46    * HunspellDictionary
  47    *
  48    * @param input TokenStream whose tokens will be stemmed
  49    * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
  50    */
  51   public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) {
  52     this(input, dictionary, true);
  53   }
  54
  55   /**
  56    * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
  57    * HunspellDictionary
  58    *
  59    * @param input TokenStream whose tokens will be stemmed
  60    * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
  61    * @param dedup true if only unique terms should be output.
  62    */
  63   public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) {
  64     super(input);
  65     this.dedup = dedup;
  66     this.stemmer = new HunspellStemmer(dictionary);
  67   }
  68
  69   /**
  70    * {@inheritDoc}
  71    */
  72   @Override
  73   public boolean incrementToken() throws IOException {
  74     if (buffer != null && !buffer.isEmpty()) {
  75       Stem nextStem = buffer.remove(0);
  76       restoreState(savedState);
  77       posIncAtt.setPositionIncrement(0);
  78       termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength());
  79       termAtt.setLength(nextStem.getStemLength());
  80       return true;
  81     }
  82
  83     if (!input.incrementToken()) {
  84       return false;
  85     }
  86
  87     buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length());
  88
  89     if (buffer.isEmpty()) { // we do not know this word, return it unchanged
  90       return true;
  91     }
  92
  93     Stem stem = buffer.remove(0);
  94     termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength());
  95     termAtt.setLength(stem.getStemLength());
  96
  97     if (!buffer.isEmpty()) {
  98       savedState = captureState();
  99     }
 100
 101     return true;
 102   }
 103
 104   /**
 105    * {@inheritDoc}
 106    */
 107   @Override
 108   public void reset() throws IOException {
 109     super.reset();
 110     buffer = null;
 111   }
 112 }