lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/StopFilter.java

   1 package org.apache.lucene.analysis;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.util.Arrays;
  22 import java.util.Set;
  23 import java.util.List;
  24
  25 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  26 import org.apache.lucene.queryParser.QueryParser; // for javadoc
  27 import org.apache.lucene.util.Version;
  28
  29 /**
  30  * Removes stop words from a token stream.
  31  *
  32  * <a name="version"/>
  33  * <p>You must specify the required {@link Version}
  34  * compatibility when creating StopFilter:
  35  * <ul>
  36  *   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
  37  *         supplementary characters in stopwords and position
  38  *         increments are preserved
  39  * </ul>
  40  */
  41 public final class StopFilter extends FilteringTokenFilter {
  42
  43   private final CharArraySet stopWords;
  44   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  45
  46   /**
  47    * Construct a token stream filtering the given input.
  48    * If <code>stopWords</code> is an instance of {@link CharArraySet} (true if
  49    * <code>makeStopSet()</code> was used to construct the set) it will be directly used
  50    * and <code>ignoreCase</code> will be ignored since <code>CharArraySet</code>
  51    * directly controls case sensitivity.
  52    * <p/>
  53    * If <code>stopWords</code> is not an instance of {@link CharArraySet},
  54    * a new CharArraySet will be constructed and <code>ignoreCase</code> will be
  55    * used to specify the case sensitivity of that set.
  56    *
  57    * @param enablePositionIncrements true if token positions should record the removed stop words
  58    * @param input Input TokenStream
  59    * @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords
  60    * @param ignoreCase if true, all words are lower cased first
  61    * @deprecated use {@link #StopFilter(Version, TokenStream, Set, boolean)} instead
  62    */
  63   @Deprecated
  64   public StopFilter(boolean enablePositionIncrements, TokenStream input, Set<?> stopWords, boolean ignoreCase)
  65   {
  66     this(Version.LUCENE_30, enablePositionIncrements, input, stopWords, ignoreCase);
  67   }
  68
  69   /**
  70    * Construct a token stream filtering the given input. If
  71    * <code>stopWords</code> is an instance of {@link CharArraySet} (true if
  72    * <code>makeStopSet()</code> was used to construct the set) it will be
  73    * directly used and <code>ignoreCase</code> will be ignored since
  74    * <code>CharArraySet</code> directly controls case sensitivity.
  75    * <p/>
  76    * If <code>stopWords</code> is not an instance of {@link CharArraySet}, a new
  77    * CharArraySet will be constructed and <code>ignoreCase</code> will be used
  78    * to specify the case sensitivity of that set.
  79    *
  80    * @param matchVersion
  81    *          Lucene version to enable correct Unicode 4.0 behavior in the stop
  82    *          set if Version > 3.0. See <a href="#version">above</a> for details.
  83    * @param input
  84    *          Input TokenStream
  85    * @param stopWords
  86    *          A Set of Strings or char[] or any other toString()-able set
  87    *          representing the stopwords
  88    * @param ignoreCase
  89    *          if true, all words are lower cased first
  90    */
  91   public StopFilter(Version matchVersion, TokenStream input, Set<?> stopWords, boolean ignoreCase)
  92   {
  93    this(matchVersion, matchVersion.onOrAfter(Version.LUCENE_29), input, stopWords, ignoreCase);
  94   }
  95
  96   /*
  97    * convenience ctor to enable deprecated ctors to set posInc explicitly
  98    */
  99   private StopFilter(Version matchVersion, boolean enablePositionIncrements, TokenStream input, Set<?> stopWords, boolean ignoreCase){
 100     super(enablePositionIncrements, input);
 101     this.stopWords = stopWords instanceof CharArraySet ? (CharArraySet)stopWords : new CharArraySet(matchVersion, stopWords, ignoreCase);
 102   }
 103
 104   /**
 105    * Constructs a filter which removes words from the input
 106    * TokenStream that are named in the Set.
 107    *
 108    * @param enablePositionIncrements true if token positions should record the removed stop words
 109    * @param in Input stream
 110    * @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords
 111    * @see #makeStopSet(Version, java.lang.String[])
 112    * @deprecated use {@link #StopFilter(Version, TokenStream, Set)} instead
 113    */
 114   @Deprecated
 115   public StopFilter(boolean enablePositionIncrements, TokenStream in, Set<?> stopWords) {
 116     this(Version.LUCENE_30, enablePositionIncrements, in, stopWords, false);
 117   }
 118
 119   /**
 120    * Constructs a filter which removes words from the input TokenStream that are
 121    * named in the Set.
 122    *
 123    * @param matchVersion
 124    *          Lucene version to enable correct Unicode 4.0 behavior in the stop
 125    *          set if Version > 3.0.  See <a href="#version">above</a> for details.
 126    * @param in
 127    *          Input stream
 128    * @param stopWords
 129    *          A Set of Strings or char[] or any other toString()-able set
 130    *          representing the stopwords
 131    * @see #makeStopSet(Version, java.lang.String[])
 132    */
 133   public StopFilter(Version matchVersion, TokenStream in, Set<?> stopWords) {
 134     this(matchVersion, in, stopWords, false);
 135   }
 136
 137   /**
 138    * Builds a Set from an array of stop words,
 139    * appropriate for passing into the StopFilter constructor.
 140    * This permits this stopWords construction to be cached once when
 141    * an Analyzer is constructed.
 142    *
 143    * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
 144    * @deprecated use {@link #makeStopSet(Version, String...)} instead
 145    */
 146   @Deprecated
 147   public static final Set<Object> makeStopSet(String... stopWords) {
 148     return makeStopSet(Version.LUCENE_30, stopWords, false);
 149   }
 150
 151   /**
 152    * Builds a Set from an array of stop words,
 153    * appropriate for passing into the StopFilter constructor.
 154    * This permits this stopWords construction to be cached once when
 155    * an Analyzer is constructed.
 156    *
 157    * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
 158    * @param stopWords An array of stopwords
 159    * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
 160    */
 161   public static final Set<Object> makeStopSet(Version matchVersion, String... stopWords) {
 162     return makeStopSet(matchVersion, stopWords, false);
 163   }
 164
 165   /**
 166    * Builds a Set from an array of stop words,
 167    * appropriate for passing into the StopFilter constructor.
 168    * This permits this stopWords construction to be cached once when
 169    * an Analyzer is constructed.
 170    * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
 171    * @return A Set ({@link CharArraySet}) containing the words
 172    * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
 173    * @deprecated use {@link #makeStopSet(Version, List)} instead
 174    */
 175   @Deprecated
 176   public static final Set<Object> makeStopSet(List<?> stopWords) {
 177     return makeStopSet(Version.LUCENE_30, stopWords, false);
 178   }
 179
 180   /**
 181    * Builds a Set from an array of stop words,
 182    * appropriate for passing into the StopFilter constructor.
 183    * This permits this stopWords construction to be cached once when
 184    * an Analyzer is constructed.
 185    *
 186    * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
 187    * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
 188    * @return A Set ({@link CharArraySet}) containing the words
 189    * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
 190    */
 191   public static final Set<Object> makeStopSet(Version matchVersion, List<?> stopWords) {
 192     return makeStopSet(matchVersion, stopWords, false);
 193   }
 194
 195   /**
 196    * Creates a stopword set from the given stopword array.
 197    * @param stopWords An array of stopwords
 198    * @param ignoreCase If true, all words are lower cased first.
 199    * @return a Set containing the words
 200    * @deprecated use {@link #makeStopSet(Version, String[], boolean)} instead;
 201    */
 202   @Deprecated
 203   public static final Set<Object> makeStopSet(String[] stopWords, boolean ignoreCase) {
 204     return makeStopSet(Version.LUCENE_30, stopWords, ignoreCase);
 205   }
 206   /**
 207    * Creates a stopword set from the given stopword array.
 208    *
 209    * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
 210    * @param stopWords An array of stopwords
 211    * @param ignoreCase If true, all words are lower cased first.
 212    * @return a Set containing the words
 213    */
 214   public static final Set<Object> makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) {
 215     CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.length, ignoreCase);
 216     stopSet.addAll(Arrays.asList(stopWords));
 217     return stopSet;
 218   }
 219
 220   /**
 221    * Creates a stopword set from the given stopword list.
 222    * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
 223    * @param ignoreCase if true, all words are lower cased first
 224    * @return A Set ({@link CharArraySet}) containing the words
 225    * @deprecated use {@link #makeStopSet(Version, List, boolean)} instead
 226    */
 227   @Deprecated
 228   public static final Set<Object> makeStopSet(List<?> stopWords, boolean ignoreCase){
 229     return makeStopSet(Version.LUCENE_30, stopWords, ignoreCase);
 230   }
 231
 232   /**
 233    * Creates a stopword set from the given stopword list.
 234    * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
 235    * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
 236    * @param ignoreCase if true, all words are lower cased first
 237    * @return A Set ({@link CharArraySet}) containing the words
 238    */
 239   public static final Set<Object> makeStopSet(Version matchVersion, List<?> stopWords, boolean ignoreCase){
 240     CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.size(), ignoreCase);
 241     stopSet.addAll(stopWords);
 242     return stopSet;
 243   }
 244
 245   /**
 246    * Returns the next input Token whose term() is not a stop word.
 247    */
 248   @Override
 249   protected boolean accept() throws IOException {
 250     return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
 251   }
 252
 253   /**
 254    * Returns version-dependent default for
 255    * enablePositionIncrements.  Analyzers that embed
 256    * StopFilter use this method when creating the
 257    * StopFilter.  Prior to 2.9, this returns false.  On 2.9
 258    * or later, it returns true.
 259    * @deprecated use {@link #StopFilter(Version, TokenStream, Set)} instead
 260    */
 261   @Deprecated
 262   public static boolean getEnablePositionIncrementsVersionDefault(Version matchVersion) {
 263     return matchVersion.onOrAfter(Version.LUCENE_29);
 264   }
 265 }