lucene-java-3.4.0/lucene/contrib/analyzers/stempel/src/java/org/apache/lucene/analysis/stempel/StempelFilter.java

   1 /**
   2  * Copyright 2004 The Apache Software Foundation
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License"); you may
   5  * not use this file except in compliance with the License. You may obtain a
   6  * copy of the License at
   7  *
   8  * http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  13  * License for the specific language governing permissions and limitations
  14  * under the License.
  15  */
  16
  17 package org.apache.lucene.analysis.stempel;
  18
  19 import java.io.IOException;
  20
  21 import org.apache.lucene.analysis.TokenFilter;
  22 import org.apache.lucene.analysis.TokenStream;
  23 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  24 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
  25
  26 /**
  27  * Transforms the token stream as per the stemming algorithm.
  28  * <p>
  29  * Note: the input to the stemming filter must already be in lower case, so you
  30  * will need to use LowerCaseFilter or LowerCaseTokenizer farther down the
  31  * Tokenizer chain in order for this to work properly!
  32  */
  33 public final class StempelFilter extends TokenFilter {
  34   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  35   private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
  36   private final StempelStemmer stemmer;
  37   private final int minLength;
  38
  39   /**
  40    * Minimum length of input words to be processed. Shorter words are returned
  41    * unchanged.
  42    */
  43   public static final int DEFAULT_MIN_LENGTH = 3;
  44
  45   /**
  46    * Create filter using the supplied stemming table.
  47    *
  48    * @param in input token stream
  49    * @param stemmer stemmer
  50    */
  51   public StempelFilter(TokenStream in, StempelStemmer stemmer) {
  52     this(in, stemmer, DEFAULT_MIN_LENGTH);
  53   }
  54
  55   /**
  56    * Create filter using the supplied stemming table.
  57    *
  58    * @param in input token stream
  59    * @param stemmer stemmer
  60    * @param minLength For performance reasons words shorter than minLength
  61    * characters are not processed, but simply returned.
  62    */
  63   public StempelFilter(TokenStream in, StempelStemmer stemmer, int minLength) {
  64     super(in);
  65     this.stemmer = stemmer;
  66     this.minLength = minLength;
  67   }
  68
  69   /** Returns the next input Token, after being stemmed */
  70   @Override
  71   public boolean incrementToken() throws IOException {
  72     if (input.incrementToken()) {
  73       if (!keywordAtt.isKeyword() && termAtt.length() > minLength) {
  74         StringBuilder sb = stemmer.stem(termAtt);
  75         if (sb != null) // if we can't stem it, return unchanged
  76           termAtt.setEmpty().append(sb);
  77       }
  78       return true;
  79     } else {
  80       return false;
  81     }
  82   }
  83 }