lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java

   1 package org.apache.lucene.analysis.query;
   2 /**
   3  * Licensed to the Apache Software Foundation (ASF) under one or more
   4  * contributor license agreements.  See the NOTICE file distributed with
   5  * this work for additional information regarding copyright ownership.
   6  * The ASF licenses this file to You under the Apache License, Version 2.0
   7  * (the "License"); you may not use this file except in compliance with
   8  * the License.  You may obtain a copy of the License at
   9  *
  10  *     http://www.apache.org/licenses/LICENSE-2.0
  11  *
  12  * Unless required by applicable law or agreed to in writing, software
  13  * distributed under the License is distributed on an "AS IS" BASIS,
  14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15  * See the License for the specific language governing permissions and
  16  * limitations under the License.
  17  */
  18
  19 import org.apache.lucene.index.IndexReader;
  20 import org.apache.lucene.index.Term;
  21 import org.apache.lucene.index.TermEnum;
  22 import org.apache.lucene.analysis.Analyzer;
  23 import org.apache.lucene.analysis.TokenStream;
  24 import org.apache.lucene.analysis.StopFilter;
  25 import org.apache.lucene.util.StringHelper;
  26 import org.apache.lucene.util.Version;
  27
  28 import java.io.IOException;
  29 import java.io.Reader;
  30 import java.util.*;
  31
  32 /**
  33  * An {@link Analyzer} used primarily at query time to wrap another analyzer and provide a layer of protection
  34  * which prevents very common words from being passed into queries.
  35  * <p>
  36  * For very large indexes the cost
  37  * of reading TermDocs for a very common word can be  high. This analyzer was created after experience with
  38  * a 38 million doc index which had a term in around 50% of docs and was causing TermQueries for
  39  * this term to take 2 seconds.
  40  * </p>
  41  * <p>
  42  * Use the various "addStopWords" methods in this class to automate the identification and addition of
  43  * stop words found in an already existing index.
  44  * </p>
  45  */
  46 public final class QueryAutoStopWordAnalyzer extends Analyzer {
  47   Analyzer delegate;
  48   HashMap<String,HashSet<String>> stopWordsPerField = new HashMap<String,HashSet<String>>();
  49   //The default maximum percentage (40%) of index documents which
  50   //can contain a term, after which the term is considered to be a stop word.
  51   public static final float defaultMaxDocFreqPercent = 0.4f;
  52   private final Version matchVersion;
  53
  54   /**
  55    * Initializes this analyzer with the Analyzer object that actually produces the tokens
  56    *
  57    * @param delegate The choice of {@link Analyzer} that is used to produce the token stream which needs filtering
  58    */
  59   public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer delegate) {
  60     this.delegate = delegate;
  61     this.matchVersion = matchVersion;
  62   }
  63
  64   /**
  65    * Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
  66    *
  67    * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
  68    *               exceed the required document frequency
  69    * @return The number of stop words identified.
  70    * @throws IOException
  71    */
  72   public int addStopWords(IndexReader reader) throws IOException {
  73     return addStopWords(reader, defaultMaxDocFreqPercent);
  74   }
  75
  76   /**
  77    * Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
  78    *
  79    * @param reader     The {@link IndexReader} which will be consulted to identify potential stop words that
  80    *                   exceed the required document frequency
  81    * @param maxDocFreq The maximum number of index documents which can contain a term, after which
  82    *                   the term is considered to be a stop word
  83    * @return The number of stop words identified.
  84    * @throws IOException
  85    */
  86   public int addStopWords(IndexReader reader, int maxDocFreq) throws IOException {
  87     int numStopWords = 0;
  88     Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
  89     for (Iterator<String> iter = fieldNames.iterator(); iter.hasNext();) {
  90       String fieldName = iter.next();
  91       numStopWords += addStopWords(reader, fieldName, maxDocFreq);
  92     }
  93     return numStopWords;
  94   }
  95
  96   /**
  97    * Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
  98    *
  99    * @param reader        The {@link IndexReader} which will be consulted to identify potential stop words that
 100    *                      exceed the required document frequency
 101    * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
 102    *                      contain a term, after which the word is considered to be a stop word.
 103    * @return The number of stop words identified.
 104    * @throws IOException
 105    */
 106   public int addStopWords(IndexReader reader, float maxPercentDocs) throws IOException {
 107     int numStopWords = 0;
 108     Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
 109     for (Iterator<String> iter = fieldNames.iterator(); iter.hasNext();) {
 110       String fieldName = iter.next();
 111       numStopWords += addStopWords(reader, fieldName, maxPercentDocs);
 112     }
 113     return numStopWords;
 114   }
 115
 116   /**
 117    * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
 118    *
 119    * @param reader         The {@link IndexReader} which will be consulted to identify potential stop words that
 120    *                       exceed the required document frequency
 121    * @param fieldName      The field for which stopwords will be added
 122    * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
 123    *                       contain a term, after which the word is considered to be a stop word.
 124    * @return The number of stop words identified.
 125    * @throws IOException
 126    */
 127   public int addStopWords(IndexReader reader, String fieldName, float maxPercentDocs) throws IOException {
 128     return addStopWords(reader, fieldName, (int) (reader.numDocs() * maxPercentDocs));
 129   }
 130
 131   /**
 132    * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
 133    *
 134    * @param reader     The {@link IndexReader} which will be consulted to identify potential stop words that
 135    *                   exceed the required document frequency
 136    * @param fieldName  The field for which stopwords will be added
 137    * @param maxDocFreq The maximum number of index documents which
 138    *                   can contain a term, after which the term is considered to be a stop word.
 139    * @return The number of stop words identified.
 140    * @throws IOException
 141    */
 142   public int addStopWords(IndexReader reader, String fieldName, int maxDocFreq) throws IOException {
 143     HashSet<String> stopWords = new HashSet<String>();
 144     String internedFieldName = StringHelper.intern(fieldName);
 145     TermEnum te = reader.terms(new Term(fieldName));
 146     Term term = te.term();
 147     while (term != null) {
 148       if (term.field() != internedFieldName) {
 149         break;
 150       }
 151       if (te.docFreq() > maxDocFreq) {
 152         stopWords.add(term.text());
 153       }
 154       if (!te.next()) {
 155         break;
 156       }
 157       term = te.term();
 158     }
 159     stopWordsPerField.put(fieldName, stopWords);
 160
 161     /* if the stopwords for a field are changed,
 162      * then saved streams for that field are erased.
 163      */
 164     @SuppressWarnings("unchecked")
 165     Map<String,SavedStreams> streamMap = (Map<String,SavedStreams>) getPreviousTokenStream();
 166     if (streamMap != null)
 167       streamMap.remove(fieldName);
 168
 169     return stopWords.size();
 170   }
 171
 172   @Override
 173   public TokenStream tokenStream(String fieldName, Reader reader) {
 174     TokenStream result;
 175     try {
 176       result = delegate.reusableTokenStream(fieldName, reader);
 177     } catch (IOException e) {
 178       result = delegate.tokenStream(fieldName, reader);
 179     }
 180     HashSet<String> stopWords = stopWordsPerField.get(fieldName);
 181     if (stopWords != null) {
 182       result = new StopFilter(matchVersion, result, stopWords);
 183     }
 184     return result;
 185   }
 186
 187   private class SavedStreams {
 188     /* the underlying stream */
 189     TokenStream wrapped;
 190
 191     /*
 192      * when there are no stopwords for the field, refers to wrapped.
 193      * if there stopwords, it is a StopFilter around wrapped.
 194      */
 195     TokenStream withStopFilter;
 196   }
 197
 198   @Override
 199   public TokenStream reusableTokenStream(String fieldName, Reader reader)
 200       throws IOException {
 201     /* map of SavedStreams for each field */
 202     @SuppressWarnings("unchecked")
 203     Map<String,SavedStreams> streamMap = (Map<String,SavedStreams>) getPreviousTokenStream();
 204     if (streamMap == null) {
 205       streamMap = new HashMap<String, SavedStreams>();
 206       setPreviousTokenStream(streamMap);
 207     }
 208
 209     SavedStreams streams = streamMap.get(fieldName);
 210     if (streams == null) {
 211       /* an entry for this field does not exist, create one */
 212       streams = new SavedStreams();
 213       streamMap.put(fieldName, streams);
 214       streams.wrapped = delegate.reusableTokenStream(fieldName, reader);
 215
 216       /* if there are any stopwords for the field, save the stopfilter */
 217       HashSet<String> stopWords = stopWordsPerField.get(fieldName);
 218       if (stopWords != null)
 219         streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
 220       else
 221         streams.withStopFilter = streams.wrapped;
 222
 223     } else {
 224       /*
 225        * an entry for this field exists, verify the wrapped stream has not
 226        * changed. if it has not, reuse it, otherwise wrap the new stream.
 227        */
 228       TokenStream result = delegate.reusableTokenStream(fieldName, reader);
 229       if (result == streams.wrapped) {
 230         /* the wrapped analyzer reused the stream */
 231       } else {
 232         /*
 233          * the wrapped analyzer did not. if there are any stopwords for the
 234          * field, create a new StopFilter around the new stream
 235          */
 236         streams.wrapped = result;
 237         HashSet<String> stopWords = stopWordsPerField.get(fieldName);
 238         if (stopWords != null)
 239           streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
 240         else
 241           streams.withStopFilter = streams.wrapped;
 242       }
 243     }
 244
 245     return streams.withStopFilter;
 246   }
 247
 248   /**
 249    * Provides information on which stop words have been identified for a field
 250    *
 251    * @param fieldName The field for which stop words identified in "addStopWords"
 252    *                  method calls will be returned
 253    * @return the stop words identified for a field
 254    */
 255   public String[] getStopWords(String fieldName) {
 256     String[] result;
 257     HashSet<String> stopWords = stopWordsPerField.get(fieldName);
 258     if (stopWords != null) {
 259       result = stopWords.toArray(new String[stopWords.size()]);
 260     } else {
 261       result = new String[0];
 262     }
 263     return result;
 264   }
 265
 266   /**
 267    * Provides information on which stop words have been identified for all fields
 268    *
 269    * @return the stop words (as terms)
 270    */
 271   public Term[] getStopWords() {
 272     ArrayList<Term> allStopWords = new ArrayList<Term>();
 273     for (Iterator<String> iter = stopWordsPerField.keySet().iterator(); iter.hasNext();) {
 274       String fieldName = iter.next();
 275       HashSet<String> stopWords = stopWordsPerField.get(fieldName);
 276       for (Iterator<String> iterator = stopWords.iterator(); iterator.hasNext();) {
 277         String text = iterator.next();
 278         allStopWords.add(new Term(fieldName, text));
 279       }
 280     }
 281     return allStopWords.toArray(new Term[allStopWords.size()]);
 282         }
 283
 284 }