1 package org.apache.lucene.analysis.snowball;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
22 import org.apache.lucene.analysis.TokenFilter;
23 import org.apache.lucene.analysis.TokenStream;
24 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
25 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
26 import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link
27 import org.apache.lucene.analysis.LowerCaseFilter; // javadoc @link
28 import org.tartarus.snowball.SnowballProgram;
/**
 * A filter that stems words using a Snowball-generated stemmer.
 *
 * Available stemmers are listed in {@link org.tartarus.snowball.ext}.
 * <p><b>NOTE</b>: SnowballFilter expects lowercased text.
 * <ul>
 *  <li>For the Turkish language, see {@link TurkishLowerCaseFilter}.</li>
 *  <li>For other languages, see {@link LowerCaseFilter}.</li>
 * </ul>
 */
41 public final class SnowballFilter extends TokenFilter {
43 private final SnowballProgram stemmer;
45 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
46 private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
/**
 * Construct a stemming filter around an already-created Snowball stemmer.
 *
 * @param input the input tokens to stem
 * @param stemmer the Snowball program instance this filter stores and applies to terms
 */
public SnowballFilter(TokenStream input, SnowballProgram stemmer) {
  super(input);
  this.stemmer = stemmer;
}
/**
 * Construct the named stemming filter.
 *
 * Available stemmers are listed in {@link org.tartarus.snowball.ext}.
 * The name of a stemmer is the part of the class name before "Stemmer",
 * e.g., the stemmer in {@link org.tartarus.snowball.ext.EnglishStemmer} is named "English".
 *
 * @param in the input tokens to stem
 * @param name the name of a stemmer
 * @throws RuntimeException if the class {@code org.tartarus.snowball.ext.<name>Stemmer}
 *         cannot be loaded, instantiated, or cast to {@link SnowballProgram}; the
 *         underlying exception is attached as the cause
 */
public SnowballFilter(TokenStream in, String name) {
  super(in);
  try {
    // The stemmer class is located purely by naming convention: <name>Stemmer in the ext package.
    Class<?> stemClass = Class.forName("org.tartarus.snowball.ext." + name + "Stemmer");
    stemmer = (SnowballProgram) stemClass.newInstance();
  } catch (Exception e) {
    // Keep the existing message text, but chain the original exception so its
    // type and stack trace are not lost to callers and logs.
    throw new RuntimeException(e.toString(), e);
  }
}
73 /** Returns the next input Token, after being stemmed */
75 public final boolean incrementToken() throws IOException {
76 if (input.incrementToken()) {
77 if (!keywordAttr.isKeyword()) {
78 char termBuffer[] = termAtt.buffer();
79 final int length = termAtt.length();
80 stemmer.setCurrent(termBuffer, length);
82 final char finalTerm[] = stemmer.getCurrentBuffer();
83 final int newLength = stemmer.getCurrentBufferLength();
84 if (finalTerm != termBuffer)
85 termAtt.copyBuffer(finalTerm, 0, newLength);
87 termAtt.setLength(newLength);