1 package org.apache.lucene.analysis.query;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
/**
 * An {@link Analyzer} used primarily at query time to wrap another analyzer and provide a layer of protection
 * which prevents very common words from being passed into queries.
 * <p>
 * For very large indexes the cost
 * of reading TermDocs for a very common word can be high. This analyzer was created after experience with
 * a 38 million doc index which had a term in around 50% of docs and was causing TermQueries for
 * this term to take 2 seconds.
 * </p>
 * <p>
 * Use the various "addStopWords" methods in this class to automate the identification and addition of
 * stop words found in an already existing index.
 * </p>
 */
46 public final class QueryAutoStopWordAnalyzer extends Analyzer {
48 HashMap<String,HashSet<String>> stopWordsPerField = new HashMap<String,HashSet<String>>();
49 //The default maximum percentage (40%) of index documents which
50 //can contain a term, after which the term is considered to be a stop word.
51 public static final float defaultMaxDocFreqPercent = 0.4f;
52 private final Version matchVersion;
55 * Initializes this analyzer with the Analyzer object that actually produces the tokens
57 * @param delegate The choice of {@link Analyzer} that is used to produce the token stream which needs filtering
59 public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer delegate) {
60 this.delegate = delegate;
61 this.matchVersion = matchVersion;
65 * Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
67 * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
68 * exceed the required document frequency
69 * @return The number of stop words identified.
72 public int addStopWords(IndexReader reader) throws IOException {
73 return addStopWords(reader, defaultMaxDocFreqPercent);
77 * Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
79 * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
80 * exceed the required document frequency
81 * @param maxDocFreq The maximum number of index documents which can contain a term, after which
82 * the term is considered to be a stop word
83 * @return The number of stop words identified.
86 public int addStopWords(IndexReader reader, int maxDocFreq) throws IOException {
88 Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
89 for (Iterator<String> iter = fieldNames.iterator(); iter.hasNext();) {
90 String fieldName = iter.next();
91 numStopWords += addStopWords(reader, fieldName, maxDocFreq);
97 * Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
99 * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
100 * exceed the required document frequency
101 * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
102 * contain a term, after which the word is considered to be a stop word.
103 * @return The number of stop words identified.
104 * @throws IOException
106 public int addStopWords(IndexReader reader, float maxPercentDocs) throws IOException {
107 int numStopWords = 0;
108 Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
109 for (Iterator<String> iter = fieldNames.iterator(); iter.hasNext();) {
110 String fieldName = iter.next();
111 numStopWords += addStopWords(reader, fieldName, maxPercentDocs);
117 * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
119 * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
120 * exceed the required document frequency
121 * @param fieldName The field for which stopwords will be added
122 * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
123 * contain a term, after which the word is considered to be a stop word.
124 * @return The number of stop words identified.
125 * @throws IOException
127 public int addStopWords(IndexReader reader, String fieldName, float maxPercentDocs) throws IOException {
128 return addStopWords(reader, fieldName, (int) (reader.numDocs() * maxPercentDocs));
132 * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
134 * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
135 * exceed the required document frequency
136 * @param fieldName The field for which stopwords will be added
137 * @param maxDocFreq The maximum number of index documents which
138 * can contain a term, after which the term is considered to be a stop word.
139 * @return The number of stop words identified.
140 * @throws IOException
142 public int addStopWords(IndexReader reader, String fieldName, int maxDocFreq) throws IOException {
143 HashSet<String> stopWords = new HashSet<String>();
144 String internedFieldName = StringHelper.intern(fieldName);
145 TermEnum te = reader.terms(new Term(fieldName));
146 Term term = te.term();
147 while (term != null) {
148 if (term.field() != internedFieldName) {
151 if (te.docFreq() > maxDocFreq) {
152 stopWords.add(term.text());
159 stopWordsPerField.put(fieldName, stopWords);
161 /* if the stopwords for a field are changed,
162 * then saved streams for that field are erased.
164 @SuppressWarnings("unchecked")
165 Map<String,SavedStreams> streamMap = (Map<String,SavedStreams>) getPreviousTokenStream();
166 if (streamMap != null)
167 streamMap.remove(fieldName);
169 return stopWords.size();
173 public TokenStream tokenStream(String fieldName, Reader reader) {
176 result = delegate.reusableTokenStream(fieldName, reader);
177 } catch (IOException e) {
178 result = delegate.tokenStream(fieldName, reader);
180 HashSet<String> stopWords = stopWordsPerField.get(fieldName);
181 if (stopWords != null) {
182 result = new StopFilter(matchVersion, result, stopWords);
187 private class SavedStreams {
188 /* the underlying stream */
192 * when there are no stopwords for the field, refers to wrapped.
193 * if there stopwords, it is a StopFilter around wrapped.
195 TokenStream withStopFilter;
199 public TokenStream reusableTokenStream(String fieldName, Reader reader)
201 /* map of SavedStreams for each field */
202 @SuppressWarnings("unchecked")
203 Map<String,SavedStreams> streamMap = (Map<String,SavedStreams>) getPreviousTokenStream();
204 if (streamMap == null) {
205 streamMap = new HashMap<String, SavedStreams>();
206 setPreviousTokenStream(streamMap);
209 SavedStreams streams = streamMap.get(fieldName);
210 if (streams == null) {
211 /* an entry for this field does not exist, create one */
212 streams = new SavedStreams();
213 streamMap.put(fieldName, streams);
214 streams.wrapped = delegate.reusableTokenStream(fieldName, reader);
216 /* if there are any stopwords for the field, save the stopfilter */
217 HashSet<String> stopWords = stopWordsPerField.get(fieldName);
218 if (stopWords != null)
219 streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
221 streams.withStopFilter = streams.wrapped;
225 * an entry for this field exists, verify the wrapped stream has not
226 * changed. if it has not, reuse it, otherwise wrap the new stream.
228 TokenStream result = delegate.reusableTokenStream(fieldName, reader);
229 if (result == streams.wrapped) {
230 /* the wrapped analyzer reused the stream */
233 * the wrapped analyzer did not. if there are any stopwords for the
234 * field, create a new StopFilter around the new stream
236 streams.wrapped = result;
237 HashSet<String> stopWords = stopWordsPerField.get(fieldName);
238 if (stopWords != null)
239 streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
241 streams.withStopFilter = streams.wrapped;
245 return streams.withStopFilter;
249 * Provides information on which stop words have been identified for a field
251 * @param fieldName The field for which stop words identified in "addStopWords"
252 * method calls will be returned
253 * @return the stop words identified for a field
255 public String[] getStopWords(String fieldName) {
257 HashSet<String> stopWords = stopWordsPerField.get(fieldName);
258 if (stopWords != null) {
259 result = stopWords.toArray(new String[stopWords.size()]);
261 result = new String[0];
267 * Provides information on which stop words have been identified for all fields
269 * @return the stop words (as terms)
271 public Term[] getStopWords() {
272 ArrayList<Term> allStopWords = new ArrayList<Term>();
273 for (Iterator<String> iter = stopWordsPerField.keySet().iterator(); iter.hasNext();) {
274 String fieldName = iter.next();
275 HashSet<String> stopWords = stopWordsPerField.get(fieldName);
276 for (Iterator<String> iterator = stopWords.iterator(); iterator.hasNext();) {
277 String text = iterator.next();
278 allStopWords.add(new Term(fieldName, text));
281 return allStopWords.toArray(new Term[allStopWords.size()]);