1 package org.apache.lucene.analysis;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.queryParser.QueryParser; // for javadoc
import org.apache.lucene.util.Version;

30 * Removes stop words from a token stream.
33 * <p>You must specify the required {@link Version}
34 * compatibility when creating StopFilter:
36 * <li> As of 3.1, StopFilter correctly handles Unicode 4.0
37 * supplementary characters in stopwords and position
38 * increments are preserved
41 public final class StopFilter extends FilteringTokenFilter {
43 private final CharArraySet stopWords;
44 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
47 * Construct a token stream filtering the given input.
48 * If <code>stopWords</code> is an instance of {@link CharArraySet} (true if
49 * <code>makeStopSet()</code> was used to construct the set) it will be directly used
50 * and <code>ignoreCase</code> will be ignored since <code>CharArraySet</code>
51 * directly controls case sensitivity.
53 * If <code>stopWords</code> is not an instance of {@link CharArraySet},
54 * a new CharArraySet will be constructed and <code>ignoreCase</code> will be
55 * used to specify the case sensitivity of that set.
57 * @param enablePositionIncrements true if token positions should record the removed stop words
58 * @param input Input TokenStream
59 * @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords
60 * @param ignoreCase if true, all words are lower cased first
61 * @deprecated use {@link #StopFilter(Version, TokenStream, Set, boolean)} instead
64 public StopFilter(boolean enablePositionIncrements, TokenStream input, Set<?> stopWords, boolean ignoreCase)
66 this(Version.LUCENE_30, enablePositionIncrements, input, stopWords, ignoreCase);
70 * Construct a token stream filtering the given input. If
71 * <code>stopWords</code> is an instance of {@link CharArraySet} (true if
72 * <code>makeStopSet()</code> was used to construct the set) it will be
73 * directly used and <code>ignoreCase</code> will be ignored since
74 * <code>CharArraySet</code> directly controls case sensitivity.
76 * If <code>stopWords</code> is not an instance of {@link CharArraySet}, a new
77 * CharArraySet will be constructed and <code>ignoreCase</code> will be used
78 * to specify the case sensitivity of that set.
81 * Lucene version to enable correct Unicode 4.0 behavior in the stop
82 * set if Version > 3.0. See <a href="#version">above</a> for details.
86 * A Set of Strings or char[] or any other toString()-able set
87 * representing the stopwords
89 * if true, all words are lower cased first
91 public StopFilter(Version matchVersion, TokenStream input, Set<?> stopWords, boolean ignoreCase)
93 this(matchVersion, matchVersion.onOrAfter(Version.LUCENE_29), input, stopWords, ignoreCase);
97 * convenience ctor to enable deprecated ctors to set posInc explicitly
99 private StopFilter(Version matchVersion, boolean enablePositionIncrements, TokenStream input, Set<?> stopWords, boolean ignoreCase){
100 super(enablePositionIncrements, input);
101 this.stopWords = stopWords instanceof CharArraySet ? (CharArraySet)stopWords : new CharArraySet(matchVersion, stopWords, ignoreCase);
105 * Constructs a filter which removes words from the input
106 * TokenStream that are named in the Set.
108 * @param enablePositionIncrements true if token positions should record the removed stop words
109 * @param in Input stream
110 * @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords
111 * @see #makeStopSet(Version, java.lang.String[])
112 * @deprecated use {@link #StopFilter(Version, TokenStream, Set)} instead
115 public StopFilter(boolean enablePositionIncrements, TokenStream in, Set<?> stopWords) {
116 this(Version.LUCENE_30, enablePositionIncrements, in, stopWords, false);
120 * Constructs a filter which removes words from the input TokenStream that are
123 * @param matchVersion
124 * Lucene version to enable correct Unicode 4.0 behavior in the stop
125 * set if Version > 3.0. See <a href="#version">above</a> for details.
129 * A Set of Strings or char[] or any other toString()-able set
130 * representing the stopwords
131 * @see #makeStopSet(Version, java.lang.String[])
133 public StopFilter(Version matchVersion, TokenStream in, Set<?> stopWords) {
134 this(matchVersion, in, stopWords, false);
138 * Builds a Set from an array of stop words,
139 * appropriate for passing into the StopFilter constructor.
140 * This permits this stopWords construction to be cached once when
141 * an Analyzer is constructed.
143 * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
144 * @deprecated use {@link #makeStopSet(Version, String...)} instead
147 public static final Set<Object> makeStopSet(String... stopWords) {
148 return makeStopSet(Version.LUCENE_30, stopWords, false);
152 * Builds a Set from an array of stop words,
153 * appropriate for passing into the StopFilter constructor.
154 * This permits this stopWords construction to be cached once when
155 * an Analyzer is constructed.
157 * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
158 * @param stopWords An array of stopwords
159 * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
161 public static final Set<Object> makeStopSet(Version matchVersion, String... stopWords) {
162 return makeStopSet(matchVersion, stopWords, false);
166 * Builds a Set from an array of stop words,
167 * appropriate for passing into the StopFilter constructor.
168 * This permits this stopWords construction to be cached once when
169 * an Analyzer is constructed.
170 * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
171 * @return A Set ({@link CharArraySet}) containing the words
172 * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
173 * @deprecated use {@link #makeStopSet(Version, List)} instead
176 public static final Set<Object> makeStopSet(List<?> stopWords) {
177 return makeStopSet(Version.LUCENE_30, stopWords, false);
181 * Builds a Set from an array of stop words,
182 * appropriate for passing into the StopFilter constructor.
183 * This permits this stopWords construction to be cached once when
184 * an Analyzer is constructed.
186 * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
187 * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
188 * @return A Set ({@link CharArraySet}) containing the words
189 * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
191 public static final Set<Object> makeStopSet(Version matchVersion, List<?> stopWords) {
192 return makeStopSet(matchVersion, stopWords, false);
196 * Creates a stopword set from the given stopword array.
197 * @param stopWords An array of stopwords
198 * @param ignoreCase If true, all words are lower cased first.
199 * @return a Set containing the words
200 * @deprecated use {@link #makeStopSet(Version, String[], boolean)} instead;
203 public static final Set<Object> makeStopSet(String[] stopWords, boolean ignoreCase) {
204 return makeStopSet(Version.LUCENE_30, stopWords, ignoreCase);
207 * Creates a stopword set from the given stopword array.
209 * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
210 * @param stopWords An array of stopwords
211 * @param ignoreCase If true, all words are lower cased first.
212 * @return a Set containing the words
214 public static final Set<Object> makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) {
215 CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.length, ignoreCase);
216 stopSet.addAll(Arrays.asList(stopWords));
221 * Creates a stopword set from the given stopword list.
222 * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
223 * @param ignoreCase if true, all words are lower cased first
224 * @return A Set ({@link CharArraySet}) containing the words
225 * @deprecated use {@link #makeStopSet(Version, List, boolean)} instead
228 public static final Set<Object> makeStopSet(List<?> stopWords, boolean ignoreCase){
229 return makeStopSet(Version.LUCENE_30, stopWords, ignoreCase);
233 * Creates a stopword set from the given stopword list.
234 * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
235 * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
236 * @param ignoreCase if true, all words are lower cased first
237 * @return A Set ({@link CharArraySet}) containing the words
239 public static final Set<Object> makeStopSet(Version matchVersion, List<?> stopWords, boolean ignoreCase){
240 CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.size(), ignoreCase);
241 stopSet.addAll(stopWords);
246 * Returns the next input Token whose term() is not a stop word.
249 protected boolean accept() throws IOException {
250 return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
254 * Returns version-dependent default for
255 * enablePositionIncrements. Analyzers that embed
256 * StopFilter use this method when creating the
257 * StopFilter. Prior to 2.9, this returns false. On 2.9
258 * or later, it returns true.
259 * @deprecated use {@link #StopFilter(Version, TokenStream, Set)} instead
262 public static boolean getEnablePositionIncrementsVersionDefault(Version matchVersion) {
263 return matchVersion.onOrAfter(Version.LUCENE_29);