lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java

   1 package org.apache.lucene.analysis.standard;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.analysis.LowerCaseFilter;
  21 import org.apache.lucene.analysis.StopAnalyzer;
  22 import org.apache.lucene.analysis.StopFilter;
  23 import org.apache.lucene.analysis.StopwordAnalyzerBase;
  24 import org.apache.lucene.analysis.TokenStream;
  25 import org.apache.lucene.analysis.WordlistLoader;
  26 import org.apache.lucene.util.Version;
  27
  28 import java.io.File;
  29 import java.io.IOException;
  30 import java.io.Reader;
  31 import java.util.Set;
  32
  33 /**
  34  * Filters {@link ClassicTokenizer} with {@link ClassicFilter}, {@link
  35  * LowerCaseFilter} and {@link StopFilter}, using a list of
  36  * English stop words.
  37  *
  38  * <a name="version"/>
  39  * <p>You must specify the required {@link Version}
  40  * compatibility when creating ClassicAnalyzer:
  41  * <ul>
  42  *   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
  43  *         supplementary characters in stopwords
  44  *   <li> As of 2.9, StopFilter preserves position
  45  *        increments
  46  *   <li> As of 2.4, Tokens incorrectly identified as acronyms
  47  *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
  48  * </ul>
  49  *
  50  * ClassicAnalyzer was named StandardAnalyzer in Lucene versions prior to 3.1.
  51  * As of 3.1, {@link StandardAnalyzer} implements Unicode text segmentation,
  52  * as specified by UAX#29.
  53  */
  54 public final class ClassicAnalyzer extends StopwordAnalyzerBase {
  55
  56   /** Default maximum allowed token length */
  57   public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
  58
  59   private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
  60
  61   /**
  62    * Specifies whether deprecated acronyms should be replaced with HOST type.
  63    * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}
  64    */
  65   private final boolean replaceInvalidAcronym;
  66
  67   /** An unmodifiable set containing some common English words that are usually not
  68   useful for searching. */
  69   public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
  70
  71   /** Builds an analyzer with the given stop words.
  72    * @param matchVersion Lucene version to match See {@link
  73    * <a href="#version">above</a>}
  74    * @param stopWords stop words */
  75   public ClassicAnalyzer(Version matchVersion, Set<?> stopWords) {
  76     super(matchVersion, stopWords);
  77     replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_24);
  78   }
  79
  80   /** Builds an analyzer with the default stop words ({@link
  81    * #STOP_WORDS_SET}).
  82    * @param matchVersion Lucene version to match See {@link
  83    * <a href="#version">above</a>}
  84    */
  85   public ClassicAnalyzer(Version matchVersion) {
  86     this(matchVersion, STOP_WORDS_SET);
  87   }
  88
  89   /** Builds an analyzer with the stop words from the given file.
  90    * @see WordlistLoader#getWordSet(File)
  91    * @param matchVersion Lucene version to match See {@link
  92    * <a href="#version">above</a>}
  93    * @param stopwords File to read stop words from */
  94   public ClassicAnalyzer(Version matchVersion, File stopwords) throws IOException {
  95     this(matchVersion, WordlistLoader.getWordSet(stopwords));
  96   }
  97
  98   /** Builds an analyzer with the stop words from the given reader.
  99    * @see WordlistLoader#getWordSet(Reader)
 100    * @param matchVersion Lucene version to match See {@link
 101    * <a href="#version">above</a>}
 102    * @param stopwords Reader to read stop words from */
 103   public ClassicAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
 104     this(matchVersion, WordlistLoader.getWordSet(stopwords));
 105   }
 106
 107   /**
 108    * Set maximum allowed token length.  If a token is seen
 109    * that exceeds this length then it is discarded.  This
 110    * setting only takes effect the next time tokenStream or
 111    * reusableTokenStream is called.
 112    */
 113   public void setMaxTokenLength(int length) {
 114     maxTokenLength = length;
 115   }
 116
 117   /**
 118    * @see #setMaxTokenLength
 119    */
 120   public int getMaxTokenLength() {
 121     return maxTokenLength;
 122   }
 123
 124   @Override
 125   protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
 126     final ClassicTokenizer src = new ClassicTokenizer(matchVersion, reader);
 127     src.setMaxTokenLength(maxTokenLength);
 128     src.setReplaceInvalidAcronym(replaceInvalidAcronym);
 129     TokenStream tok = new ClassicFilter(src);
 130     tok = new LowerCaseFilter(matchVersion, tok);
 131     tok = new StopFilter(matchVersion, tok, stopwords);
 132     return new TokenStreamComponents(src, tok) {
 133       @Override
 134       protected boolean reset(final Reader reader) throws IOException {
 135         src.setMaxTokenLength(ClassicAnalyzer.this.maxTokenLength);
 136         return super.reset(reader);
 137       }
 138     };
 139   }
 140 }