package org.apache.lucene.analysis.query;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * An {@link Analyzer} used primarily at query time to wrap another analyzer and provide a layer of protection
 * which prevents very common words from being passed into queries.
 * <p>
 * For very large indexes the cost
 * of reading TermDocs for a very common word can be high. This analyzer was created after experience with
 * a 38 million doc index which had a term in around 50% of docs and was causing TermQueries for
 * this term to take 2 seconds.
 * </p>
 * <p>
 * Use the various "addStopWords" methods in this class to automate the identification and addition of
 * stop words found in an already existing index.
 * </p>
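 * <p>
 * A minimal usage sketch; the directory, field name, wrapped analyzer and
 * {@link Version} constant below are illustrative assumptions, not requirements
 * of this class:
 * </p>
 * <pre>
 *   IndexReader reader = IndexReader.open(directory);
 *   // treat terms appearing in more than 30% of the documents as stop words
 *   Analyzer analyzer = new QueryAutoStopWordAnalyzer(
 *       Version.LUCENE_31, new StandardAnalyzer(Version.LUCENE_31), reader, 0.3f);
 *   QueryParser parser = new QueryParser(Version.LUCENE_31, "content", analyzer);
 * </pre>
 */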
public final class QueryAutoStopWordAnalyzer extends Analyzer {

  private final Analyzer delegate;
  private final Map<String, Set<String>> stopWordsPerField = new HashMap<String, Set<String>>();
  //The default maximum percentage (40%) of index documents which
  //can contain a term, after which the term is considered to be a stop word.
  public static final float defaultMaxDocFreqPercent = 0.4f;
  private final Version matchVersion;

  /**
   * Initializes this analyzer with the Analyzer object that actually produces the tokens
   *
   * @param delegate The choice of {@link Analyzer} that is used to produce the token stream which needs filtering
   * @deprecated Stopwords should be calculated at instantiation using one of the other constructors
   */
  @Deprecated
  public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer delegate) {
    this.delegate = delegate;
    this.matchVersion = matchVersion;
  }

  /**
   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
   * indexed fields from terms with a document frequency percentage greater than
   * {@link #defaultMaxDocFreqPercent}
   *
   * @param matchVersion Version to be used in {@link StopFilter}
   * @param delegate Analyzer whose TokenStream will be filtered
   * @param indexReader IndexReader to identify the stopwords from
   * @throws IOException Can be thrown while reading from the IndexReader
   */
  public QueryAutoStopWordAnalyzer(
      Version matchVersion,
      Analyzer delegate,
      IndexReader indexReader) throws IOException {
    this(matchVersion, delegate, indexReader, defaultMaxDocFreqPercent);
  }

  /**
   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
   * indexed fields from terms with a document frequency greater than the given
   * maxDocFreq
   *
   * @param matchVersion Version to be used in {@link StopFilter}
   * @param delegate Analyzer whose TokenStream will be filtered
   * @param indexReader IndexReader to identify the stopwords from
   * @param maxDocFreq Document frequency terms should be above in order to be stopwords
   * @throws IOException Can be thrown while reading from the IndexReader
   */
  public QueryAutoStopWordAnalyzer(
      Version matchVersion,
      Analyzer delegate,
      IndexReader indexReader,
      int maxDocFreq) throws IOException {
    this(matchVersion, delegate, indexReader, indexReader.getFieldNames(IndexReader.FieldOption.INDEXED), maxDocFreq);
  }

  /**
   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
   * indexed fields from terms with a document frequency percentage greater than
   * the given maxPercentDocs
   *
   * @param matchVersion Version to be used in {@link StopFilter}
   * @param delegate Analyzer whose TokenStream will be filtered
   * @param indexReader IndexReader to identify the stopwords from
   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
   *                       contain a term, after which the word is considered to be a stop word
   * @throws IOException Can be thrown while reading from the IndexReader
   */
  public QueryAutoStopWordAnalyzer(
      Version matchVersion,
      Analyzer delegate,
      IndexReader indexReader,
      float maxPercentDocs) throws IOException {
    this(matchVersion, delegate, indexReader, indexReader.getFieldNames(IndexReader.FieldOption.INDEXED), maxPercentDocs);
  }

  /**
   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
   * given selection of fields from terms with a document frequency percentage
   * greater than the given maxPercentDocs
   *
   * @param matchVersion Version to be used in {@link StopFilter}
   * @param delegate Analyzer whose TokenStream will be filtered
   * @param indexReader IndexReader to identify the stopwords from
   * @param fields Selection of fields to calculate stopwords for
   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
   *                       contain a term, after which the word is considered to be a stop word
   * @throws IOException Can be thrown while reading from the IndexReader
   */
  public QueryAutoStopWordAnalyzer(
      Version matchVersion,
      Analyzer delegate,
      IndexReader indexReader,
      Collection<String> fields,
      float maxPercentDocs) throws IOException {
    this(matchVersion, delegate, indexReader, fields, (int) (indexReader.numDocs() * maxPercentDocs));
  }

  /**
   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
   * given selection of fields from terms with a document frequency greater than
   * the given maxDocFreq
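   * <p>
   * For example, to compute stop words only for a single field, treating terms that
   * appear in more than 100 documents as stop words (the field name, wrapped analyzer,
   * threshold and {@link Version} constant are illustrative assumptions):
   * </p>
   * <pre>
   *   Analyzer analyzer = new QueryAutoStopWordAnalyzer(
   *       Version.LUCENE_31, new StandardAnalyzer(Version.LUCENE_31),
   *       indexReader, Arrays.asList("body"), 100);
   * </pre>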
   *
   * @param matchVersion Version to be used in {@link StopFilter}
   * @param delegate Analyzer whose TokenStream will be filtered
   * @param indexReader IndexReader to identify the stopwords from
   * @param fields Selection of fields to calculate stopwords for
   * @param maxDocFreq Document frequency terms should be above in order to be stopwords
   * @throws IOException Can be thrown while reading from the IndexReader
   */
  public QueryAutoStopWordAnalyzer(
      Version matchVersion,
      Analyzer delegate,
      IndexReader indexReader,
      Collection<String> fields,
      int maxDocFreq) throws IOException {
    this.matchVersion = matchVersion;
    this.delegate = delegate;

    for (String field : fields) {
      Set<String> stopWords = new HashSet<String>();
      String internedFieldName = StringHelper.intern(field);
      TermEnum te = indexReader.terms(new Term(field));
      Term term = te.term();
      // record every term in this field whose document frequency exceeds maxDocFreq
      while (term != null) {
        if (term.field() != internedFieldName) {
          break;
        }
        if (te.docFreq() > maxDocFreq) {
          stopWords.add(term.text());
        }
        if (!te.next()) {
          break;
        }
        term = te.term();
      }
      stopWordsPerField.put(field, stopWords);
    }
  }

  /**
   * Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
   *
   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
   *               exceed the required document frequency
   * @return The number of stop words identified.
   * @throws IOException
   * @deprecated Stopwords should be calculated at instantiation using
   *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader)}
   */
  @Deprecated
  public int addStopWords(IndexReader reader) throws IOException {
    return addStopWords(reader, defaultMaxDocFreqPercent);
  }

  /**
   * Automatically adds stop words for all fields with terms exceeding the given maxDocFreq
   *
   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
   *               exceed the required document frequency
   * @param maxDocFreq The maximum number of index documents which can contain a term, after which
   *                   the term is considered to be a stop word
   * @return The number of stop words identified.
   * @throws IOException
   * @deprecated Stopwords should be calculated at instantiation using
   *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, int)}
   */
  @Deprecated
  public int addStopWords(IndexReader reader, int maxDocFreq) throws IOException {
    int numStopWords = 0;
    Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
    for (Iterator<String> iter = fieldNames.iterator(); iter.hasNext();) {
      String fieldName = iter.next();
      numStopWords += addStopWords(reader, fieldName, maxDocFreq);
    }
    return numStopWords;
  }

  /**
   * Automatically adds stop words for all fields with terms exceeding the given maxPercentDocs
   *
   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
   *               exceed the required document frequency
   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
   *                       contain a term, after which the word is considered to be a stop word.
   * @return The number of stop words identified.
   * @throws IOException
   * @deprecated Stopwords should be calculated at instantiation using
   *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, float)}
   */
  @Deprecated
  public int addStopWords(IndexReader reader, float maxPercentDocs) throws IOException {
    int numStopWords = 0;
    Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
    for (Iterator<String> iter = fieldNames.iterator(); iter.hasNext();) {
      String fieldName = iter.next();
      numStopWords += addStopWords(reader, fieldName, maxPercentDocs);
    }
    return numStopWords;
  }

  /**
   * Automatically adds stop words for the given field with terms exceeding maxPercentDocs
   *
   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
   *               exceed the required document frequency
   * @param fieldName The field for which stopwords will be added
   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
   *                       contain a term, after which the word is considered to be a stop word.
   * @return The number of stop words identified.
   * @throws IOException
   * @deprecated Stopwords should be calculated at instantiation using
   *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, Collection, float)}
   */
  @Deprecated
  public int addStopWords(IndexReader reader, String fieldName, float maxPercentDocs) throws IOException {
    return addStopWords(reader, fieldName, (int) (reader.numDocs() * maxPercentDocs));
  }

  /**
   * Automatically adds stop words for the given field with terms exceeding maxDocFreq
   *
   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
   *               exceed the required document frequency
   * @param fieldName The field for which stopwords will be added
   * @param maxDocFreq The maximum number of index documents which
   *                   can contain a term, after which the term is considered to be a stop word.
   * @return The number of stop words identified.
   * @throws IOException
   * @deprecated Stopwords should be calculated at instantiation using
   *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, Collection, int)}
   */
  @Deprecated
  public int addStopWords(IndexReader reader, String fieldName, int maxDocFreq) throws IOException {
    HashSet<String> stopWords = new HashSet<String>();
    String internedFieldName = StringHelper.intern(fieldName);
    TermEnum te = reader.terms(new Term(fieldName));
    Term term = te.term();
    while (term != null) {
      if (term.field() != internedFieldName) {
        break;
      }
      if (te.docFreq() > maxDocFreq) {
        stopWords.add(term.text());
      }
      if (!te.next()) {
        break;
      }
      term = te.term();
    }
    stopWordsPerField.put(fieldName, stopWords);

    /* if the stopwords for a field are changed,
     * then saved streams for that field are erased.
     */
    @SuppressWarnings("unchecked")
    Map<String,SavedStreams> streamMap = (Map<String,SavedStreams>) getPreviousTokenStream();
    if (streamMap != null)
      streamMap.remove(fieldName);

    return stopWords.size();
  }

  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result;
    try {
      result = delegate.reusableTokenStream(fieldName, reader);
    } catch (IOException e) {
      result = delegate.tokenStream(fieldName, reader);
    }
    Set<String> stopWords = stopWordsPerField.get(fieldName);
    if (stopWords != null) {
      result = new StopFilter(matchVersion, result, stopWords);
    }
    return result;
  }

  private class SavedStreams {
    /* the underlying stream */
    TokenStream wrapped;
    /*
     * when there are no stopwords for the field, refers to wrapped.
     * if there are stopwords, it is a StopFilter around wrapped.
     */
    TokenStream withStopFilter;
  }

  @SuppressWarnings("unchecked")
  @Override
  public TokenStream reusableTokenStream(String fieldName, Reader reader)
      throws IOException {
    /* map of SavedStreams for each field */
    Map<String,SavedStreams> streamMap = (Map<String,SavedStreams>) getPreviousTokenStream();
    if (streamMap == null) {
      streamMap = new HashMap<String, SavedStreams>();
      setPreviousTokenStream(streamMap);
    }

    SavedStreams streams = streamMap.get(fieldName);
    if (streams == null) {
      /* an entry for this field does not exist, create one */
      streams = new SavedStreams();
      streamMap.put(fieldName, streams);
      streams.wrapped = delegate.reusableTokenStream(fieldName, reader);

      /* if there are any stopwords for the field, save the stopfilter */
      Set<String> stopWords = stopWordsPerField.get(fieldName);
      if (stopWords != null) {
        streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
      } else {
        streams.withStopFilter = streams.wrapped;
      }
    } else {
      /*
       * an entry for this field exists, verify the wrapped stream has not
       * changed. if it has not, reuse it, otherwise wrap the new stream.
       */
      TokenStream result = delegate.reusableTokenStream(fieldName, reader);
      if (result == streams.wrapped) {
        /* the wrapped analyzer reused the stream */
        result = streams.withStopFilter;
      } else {
        /*
         * the wrapped analyzer did not. if there are any stopwords for the
         * field, create a new StopFilter around the new stream
         */
        streams.wrapped = result;
        Set<String> stopWords = stopWordsPerField.get(fieldName);
        if (stopWords != null) {
          streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
        } else {
          streams.withStopFilter = streams.wrapped;
        }
      }
    }

    return streams.withStopFilter;
  }

  /**
   * Provides information on which stop words have been identified for a field
   *
   * @param fieldName The field for which stop words identified in "addStopWords"
   *                  method calls will be returned
   * @return the stop words identified for a field
   */
  public String[] getStopWords(String fieldName) {
    Set<String> stopWords = stopWordsPerField.get(fieldName);
    return stopWords != null ? stopWords.toArray(new String[stopWords.size()]) : new String[0];
  }

  /**
   * Provides information on which stop words have been identified for all fields
   *
   * @return the stop words (as terms)
   */
  public Term[] getStopWords() {
    List<Term> allStopWords = new ArrayList<Term>();
    for (String fieldName : stopWordsPerField.keySet()) {
      Set<String> stopWords = stopWordsPerField.get(fieldName);
      for (String text : stopWords) {
        allStopWords.add(new Term(fieldName, text));
      }
    }
    return allStopWords.toArray(new Term[allStopWords.size()]);
  }
}