lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java

   1 package org.apache.lucene.analysis.query;
   2 /**
   3  * Licensed to the Apache Software Foundation (ASF) under one or more
   4  * contributor license agreements.  See the NOTICE file distributed with
   5  * this work for additional information regarding copyright ownership.
   6  * The ASF licenses this file to You under the Apache License, Version 2.0
   7  * (the "License"); you may not use this file except in compliance with
   8  * the License.  You may obtain a copy of the License at
   9  *
  10  *     http://www.apache.org/licenses/LICENSE-2.0
  11  *
  12  * Unless required by applicable law or agreed to in writing, software
  13  * distributed under the License is distributed on an "AS IS" BASIS,
  14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15  * See the License for the specific language governing permissions and
  16  * limitations under the License.
  17  */
  18
  19 import org.apache.lucene.index.IndexReader;
  20 import org.apache.lucene.index.Term;
  21 import org.apache.lucene.index.TermEnum;
  22 import org.apache.lucene.analysis.Analyzer;
  23 import org.apache.lucene.analysis.TokenStream;
  24 import org.apache.lucene.analysis.StopFilter;
  25 import org.apache.lucene.util.StringHelper;
  26 import org.apache.lucene.util.Version;
  27
  28 import java.io.IOException;
  29 import java.io.Reader;
  30 import java.util.*;
  31
  32 /**
  33  * An {@link Analyzer} used primarily at query time to wrap another analyzer and provide a layer of protection
  34  * which prevents very common words from being passed into queries.
  35  * <p>
  36  * For very large indexes the cost
  37  * of reading TermDocs for a very common word can be  high. This analyzer was created after experience with
  38  * a 38 million doc index which had a term in around 50% of docs and was causing TermQueries for
  39  * this term to take 2 seconds.
  40  * </p>
  41  * <p>
  42  * Use the various "addStopWords" methods in this class to automate the identification and addition of
  43  * stop words found in an already existing index.
  44  * </p>
  45  */
  46 public final class QueryAutoStopWordAnalyzer extends Analyzer {
  47
  48   private final Analyzer delegate;
  49   private final Map<String, Set<String>> stopWordsPerField = new HashMap<String, Set<String>>();
  50   //The default maximum percentage (40%) of index documents which
  51   //can contain a term, after which the term is considered to be a stop word.
  52   public static final float defaultMaxDocFreqPercent = 0.4f;
  53   private final Version matchVersion;
  54
  55   /**
  56    * Initializes this analyzer with the Analyzer object that actually produces the tokens
  57    *
  58    * @param delegate The choice of {@link Analyzer} that is used to produce the token stream which needs filtering
  59    * @deprecated Stopwords should be calculated at instantiation using one of the other constructors
  60    */
  61   @Deprecated
  62   public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer delegate) {
  63     this.delegate = delegate;
  64     this.matchVersion = matchVersion;
  65   }
  66
  67    /**
  68    * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
  69    * indexed fields from terms with a document frequency percentage greater than
  70    * {@link #defaultMaxDocFreqPercent}
  71    *
  72    * @param matchVersion Version to be used in {@link StopFilter}
  73    * @param delegate Analyzer whose TokenStream will be filtered
  74    * @param indexReader IndexReader to identify the stopwords from
  75    * @throws IOException Can be thrown while reading from the IndexReader
  76    */
  77   public QueryAutoStopWordAnalyzer(
  78       Version matchVersion,
  79       Analyzer delegate,
  80       IndexReader indexReader) throws IOException {
  81     this(matchVersion, delegate, indexReader, defaultMaxDocFreqPercent);
  82   }
  83
  84   /**
  85    * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
  86    * indexed fields from terms with a document frequency greater than the given
  87    * maxDocFreq
  88    *
  89    * @param matchVersion Version to be used in {@link StopFilter}
  90    * @param delegate Analyzer whose TokenStream will be filtered
  91    * @param indexReader IndexReader to identify the stopwords from
  92    * @param maxDocFreq Document frequency terms should be above in order to be stopwords
  93    * @throws IOException Can be thrown while reading from the IndexReader
  94    */
  95   public QueryAutoStopWordAnalyzer(
  96       Version matchVersion,
  97       Analyzer delegate,
  98       IndexReader indexReader,
  99       int maxDocFreq) throws IOException {
 100     this(matchVersion, delegate, indexReader, indexReader.getFieldNames(IndexReader.FieldOption.INDEXED), maxDocFreq);
 101   }
 102
 103   /**
 104    * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
 105    * indexed fields from terms with a document frequency percentage greater than
 106    * the given maxPercentDocs
 107    *
 108    * @param matchVersion Version to be used in {@link StopFilter}
 109    * @param delegate Analyzer whose TokenStream will be filtered
 110    * @param indexReader IndexReader to identify the stopwords from
 111    * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
 112    *                      contain a term, after which the word is considered to be a stop word
 113    * @throws IOException Can be thrown while reading from the IndexReader
 114    */
 115   public QueryAutoStopWordAnalyzer(
 116       Version matchVersion,
 117       Analyzer delegate,
 118       IndexReader indexReader,
 119       float maxPercentDocs) throws IOException {
 120     this(matchVersion, delegate, indexReader, indexReader.getFieldNames(IndexReader.FieldOption.INDEXED), maxPercentDocs);
 121   }
 122
 123   /**
 124    * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
 125    * given selection of fields from terms with a document frequency percentage
 126    * greater than the given maxPercentDocs
 127    *
 128    * @param matchVersion Version to be used in {@link StopFilter}
 129    * @param delegate Analyzer whose TokenStream will be filtered
 130    * @param indexReader IndexReader to identify the stopwords from
 131    * @param fields Selection of fields to calculate stopwords for
 132    * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
 133    *                      contain a term, after which the word is considered to be a stop word
 134    * @throws IOException Can be thrown while reading from the IndexReader
 135    */
 136   public QueryAutoStopWordAnalyzer(
 137       Version matchVersion,
 138       Analyzer delegate,
 139       IndexReader indexReader,
 140       Collection<String> fields,
 141       float maxPercentDocs) throws IOException {
 142     this(matchVersion, delegate, indexReader, fields, (int) (indexReader.numDocs() * maxPercentDocs));
 143   }
 144
 145   /**
 146    * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
 147    * given selection of fields from terms with a document frequency greater than
 148    * the given maxDocFreq
 149    *
 150    * @param matchVersion Version to be used in {@link StopFilter}
 151    * @param delegate Analyzer whose TokenStream will be filtered
 152    * @param indexReader IndexReader to identify the stopwords from
 153    * @param fields Selection of fields to calculate stopwords for
 154    * @param maxDocFreq Document frequency terms should be above in order to be stopwords
 155    * @throws IOException Can be thrown while reading from the IndexReader
 156    */
 157   public QueryAutoStopWordAnalyzer(
 158       Version matchVersion,
 159       Analyzer delegate,
 160       IndexReader indexReader,
 161       Collection<String> fields,
 162       int maxDocFreq) throws IOException {
 163     this.matchVersion = matchVersion;
 164     this.delegate = delegate;
 165
 166     for (String field : fields) {
 167       Set<String> stopWords = new HashSet<String>();
 168       String internedFieldName = StringHelper.intern(field);
 169       TermEnum te = indexReader.terms(new Term(field));
 170       Term term = te.term();
 171       while (term != null) {
 172         if (term.field() != internedFieldName) {
 173           break;
 174         }
 175         if (te.docFreq() > maxDocFreq) {
 176           stopWords.add(term.text());
 177         }
 178         if (!te.next()) {
 179           break;
 180         }
 181         term = te.term();
 182       }
 183       stopWordsPerField.put(field, stopWords);
 184     }
 185   }
 186
 187   /**
 188    * Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
 189    *
 190    * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
 191    *               exceed the required document frequency
 192    * @return The number of stop words identified.
 193    * @throws IOException
 194    * @deprecated Stopwords should be calculated at instantiation using
 195    *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader)}
 196    */
 197   @Deprecated
 198   public int addStopWords(IndexReader reader) throws IOException {
 199     return addStopWords(reader, defaultMaxDocFreqPercent);
 200   }
 201
 202   /**
 203    * Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
 204    *
 205    * @param reader     The {@link IndexReader} which will be consulted to identify potential stop words that
 206    *                   exceed the required document frequency
 207    * @param maxDocFreq The maximum number of index documents which can contain a term, after which
 208    *                   the term is considered to be a stop word
 209    * @return The number of stop words identified.
 210    * @throws IOException
 211    * @deprecated Stopwords should be calculated at instantiation using
 212    *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, int)}
 213    */
 214   @Deprecated
 215   public int addStopWords(IndexReader reader, int maxDocFreq) throws IOException {
 216     int numStopWords = 0;
 217     Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
 218     for (Iterator<String> iter = fieldNames.iterator(); iter.hasNext();) {
 219       String fieldName = iter.next();
 220       numStopWords += addStopWords(reader, fieldName, maxDocFreq);
 221     }
 222     return numStopWords;
 223   }
 224
 225   /**
 226    * Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
 227    *
 228    * @param reader        The {@link IndexReader} which will be consulted to identify potential stop words that
 229    *                      exceed the required document frequency
 230    * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
 231    *                      contain a term, after which the word is considered to be a stop word.
 232    * @return The number of stop words identified.
 233    * @throws IOException
 234    * @deprecated Stowords should be calculated at instantiation using
 235    *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, float)}
 236    */
 237   @Deprecated
 238   public int addStopWords(IndexReader reader, float maxPercentDocs) throws IOException {
 239     int numStopWords = 0;
 240     Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
 241     for (Iterator<String> iter = fieldNames.iterator(); iter.hasNext();) {
 242       String fieldName = iter.next();
 243       numStopWords += addStopWords(reader, fieldName, maxPercentDocs);
 244     }
 245     return numStopWords;
 246   }
 247
 248   /**
 249    * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
 250    *
 251    * @param reader         The {@link IndexReader} which will be consulted to identify potential stop words that
 252    *                       exceed the required document frequency
 253    * @param fieldName      The field for which stopwords will be added
 254    * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
 255    *                       contain a term, after which the word is considered to be a stop word.
 256    * @return The number of stop words identified.
 257    * @throws IOException
 258    * @deprecated Stowords should be calculated at instantiation using
 259    *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, Collection, float)}
 260    */
 261   @Deprecated
 262   public int addStopWords(IndexReader reader, String fieldName, float maxPercentDocs) throws IOException {
 263     return addStopWords(reader, fieldName, (int) (reader.numDocs() * maxPercentDocs));
 264   }
 265
 266   /**
 267    * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
 268    *
 269    * @param reader     The {@link IndexReader} which will be consulted to identify potential stop words that
 270    *                   exceed the required document frequency
 271    * @param fieldName  The field for which stopwords will be added
 272    * @param maxDocFreq The maximum number of index documents which
 273    *                   can contain a term, after which the term is considered to be a stop word.
 274    * @return The number of stop words identified.
 275    * @throws IOException
 276    * @deprecated Stowords should be calculated at instantiation using
 277    *             {@link #QueryAutoStopWordAnalyzer(Version, Analyzer, IndexReader, Collection, int)}
 278    */
 279   @Deprecated
 280   public int addStopWords(IndexReader reader, String fieldName, int maxDocFreq) throws IOException {
 281     HashSet<String> stopWords = new HashSet<String>();
 282     String internedFieldName = StringHelper.intern(fieldName);
 283     TermEnum te = reader.terms(new Term(fieldName));
 284     Term term = te.term();
 285     while (term != null) {
 286       if (term.field() != internedFieldName) {
 287         break;
 288       }
 289       if (te.docFreq() > maxDocFreq) {
 290         stopWords.add(term.text());
 291       }
 292       if (!te.next()) {
 293         break;
 294       }
 295       term = te.term();
 296     }
 297     stopWordsPerField.put(fieldName, stopWords);
 298
 299     /* if the stopwords for a field are changed,
 300      * then saved streams for that field are erased.
 301      */
 302     @SuppressWarnings("unchecked")
 303     Map<String,SavedStreams> streamMap = (Map<String,SavedStreams>) getPreviousTokenStream();
 304     if (streamMap != null)
 305       streamMap.remove(fieldName);
 306
 307     return stopWords.size();
 308   }
 309
 310   @Override
 311   public TokenStream tokenStream(String fieldName, Reader reader) {
 312     TokenStream result;
 313     try {
 314       result = delegate.reusableTokenStream(fieldName, reader);
 315     } catch (IOException e) {
 316       result = delegate.tokenStream(fieldName, reader);
 317     }
 318     Set<String> stopWords = stopWordsPerField.get(fieldName);
 319     if (stopWords != null) {
 320       result = new StopFilter(matchVersion, result, stopWords);
 321     }
 322     return result;
 323   }
 324
 325   private class SavedStreams {
 326     /* the underlying stream */
 327     TokenStream wrapped;
 328
 329     /*
 330      * when there are no stopwords for the field, refers to wrapped.
 331      * if there stopwords, it is a StopFilter around wrapped.
 332      */
 333     TokenStream withStopFilter;
 334   }
 335
 336   @SuppressWarnings("unchecked")
 337   @Override
 338   public TokenStream reusableTokenStream(String fieldName, Reader reader)
 339       throws IOException {
 340     /* map of SavedStreams for each field */
 341     Map<String,SavedStreams> streamMap = (Map<String,SavedStreams>) getPreviousTokenStream();
 342     if (streamMap == null) {
 343       streamMap = new HashMap<String, SavedStreams>();
 344       setPreviousTokenStream(streamMap);
 345     }
 346
 347     SavedStreams streams = streamMap.get(fieldName);
 348     if (streams == null) {
 349       /* an entry for this field does not exist, create one */
 350       streams = new SavedStreams();
 351       streamMap.put(fieldName, streams);
 352       streams.wrapped = delegate.reusableTokenStream(fieldName, reader);
 353
 354       /* if there are any stopwords for the field, save the stopfilter */
 355       Set<String> stopWords = stopWordsPerField.get(fieldName);
 356       if (stopWords != null) {
 357         streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
 358       } else {
 359         streams.withStopFilter = streams.wrapped;
 360       }
 361
 362     } else {
 363       /*
 364        * an entry for this field exists, verify the wrapped stream has not
 365        * changed. if it has not, reuse it, otherwise wrap the new stream.
 366        */
 367       TokenStream result = delegate.reusableTokenStream(fieldName, reader);
 368       if (result == streams.wrapped) {
 369         /* the wrapped analyzer reused the stream */
 370       } else {
 371         /*
 372          * the wrapped analyzer did not. if there are any stopwords for the
 373          * field, create a new StopFilter around the new stream
 374          */
 375         streams.wrapped = result;
 376         Set<String> stopWords = stopWordsPerField.get(fieldName);
 377         if (stopWords != null) {
 378           streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
 379         } else {
 380           streams.withStopFilter = streams.wrapped;
 381         }
 382       }
 383     }
 384
 385     return streams.withStopFilter;
 386   }
 387
 388   /**
 389    * Provides information on which stop words have been identified for a field
 390    *
 391    * @param fieldName The field for which stop words identified in "addStopWords"
 392    *                  method calls will be returned
 393    * @return the stop words identified for a field
 394    */
 395   public String[] getStopWords(String fieldName) {
 396     Set<String> stopWords = stopWordsPerField.get(fieldName);
 397     return stopWords != null ? stopWords.toArray(new String[stopWords.size()]) : new String[0];
 398   }
 399
 400   /**
 401    * Provides information on which stop words have been identified for all fields
 402    *
 403    * @return the stop words (as terms)
 404    */
 405   public Term[] getStopWords() {
 406     List<Term> allStopWords = new ArrayList<Term>();
 407     for (String fieldName : stopWordsPerField.keySet()) {
 408       Set<String> stopWords = stopWordsPerField.get(fieldName);
 409       for (String text : stopWords) {
 410         allStopWords.add(new Term(fieldName, text));
 411       }
 412     }
 413     return allStopWords.toArray(new Term[allStopWords.size()]);
 414         }
 415
 416 }