1 package org.apache.lucene.analysis.snowball;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import org.apache.lucene.analysis.*;
21 import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
22 import org.apache.lucene.analysis.standard.*;
23 import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
24 import org.apache.lucene.util.Version;
26 import java.io.IOException;
27 import java.io.Reader;
30 /** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
31 * LowerCaseFilter}, {@link StopFilter} and {@link SnowballFilter}.
33 * Available stemmers are listed in org.tartarus.snowball.ext. The name of a
34 * stemmer is the part of the class name before "Stemmer", e.g., the stemmer in
35 * {@link org.tartarus.snowball.ext.EnglishStemmer} is named "English".
37 * <p><b>NOTE</b>: This class uses the same {@link Version}
38 * dependent settings as {@link StandardAnalyzer}, with the following addition:
40 * <li> As of 3.1, uses {@link TurkishLowerCaseFilter} for Turkish language.
43 * @deprecated Use the language-specific analyzer in contrib/analyzers instead.
44 * This analyzer will be removed in Lucene 5.0
47 public final class SnowballAnalyzer extends Analyzer {
49 private Set<?> stopSet;
50 private final Version matchVersion;
52 /** Builds the named analyzer with no stop words. */
53 public SnowballAnalyzer(Version matchVersion, String name) {
55 this.matchVersion = matchVersion;
59 * Builds the named analyzer with the given stop words.
60 * @deprecated Use {@link #SnowballAnalyzer(Version, String, Set)} instead.
63 public SnowballAnalyzer(Version matchVersion, String name, String[] stopWords) {
64 this(matchVersion, name);
65 stopSet = StopFilter.makeStopSet(matchVersion, stopWords);
68 /** Builds the named analyzer with the given stop words. */
69 public SnowballAnalyzer(Version matchVersion, String name, Set<?> stopWords) {
70 this(matchVersion, name);
71 stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
75 /** Constructs a {@link StandardTokenizer} filtered by a {@link
76 StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter},
77 and a {@link SnowballFilter} */
79 public TokenStream tokenStream(String fieldName, Reader reader) {
80 TokenStream result = new StandardTokenizer(matchVersion, reader);
81 result = new StandardFilter(matchVersion, result);
82 // remove the possessive 's for english stemmers
83 if (matchVersion.onOrAfter(Version.LUCENE_31) &&
84 (name.equals("English") || name.equals("Porter") || name.equals("Lovins")))
85 result = new EnglishPossessiveFilter(result);
86 // Use a special lowercase filter for turkish, the stemmer expects it.
87 if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
88 result = new TurkishLowerCaseFilter(result);
90 result = new LowerCaseFilter(matchVersion, result);
92 result = new StopFilter(matchVersion,
94 result = new SnowballFilter(result, name);
98 private class SavedStreams {
103 /** Returns a (possibly reused) {@link StandardTokenizer} filtered by a
104 * {@link StandardFilter}, a {@link LowerCaseFilter},
105 * a {@link StopFilter}, and a {@link SnowballFilter} */
107 public TokenStream reusableTokenStream(String fieldName, Reader reader)
109 SavedStreams streams = (SavedStreams) getPreviousTokenStream();
110 if (streams == null) {
111 streams = new SavedStreams();
112 streams.source = new StandardTokenizer(matchVersion, reader);
113 streams.result = new StandardFilter(matchVersion, streams.source);
114 // Use a special lowercase filter for turkish, the stemmer expects it.
115 if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
116 streams.result = new TurkishLowerCaseFilter(streams.result);
118 streams.result = new LowerCaseFilter(matchVersion, streams.result);
120 streams.result = new StopFilter(matchVersion,
121 streams.result, stopSet);
122 streams.result = new SnowballFilter(streams.result, name);
123 setPreviousTokenStream(streams);
125 streams.source.reset(reader);
127 return streams.result;