lucene-java-3.5.0/lucene/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java

   1 package org.apache.lucene.analysis.standard;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.analysis.LowerCaseFilter;
  21 import org.apache.lucene.analysis.StopAnalyzer;
  22 import org.apache.lucene.analysis.StopFilter;
  23 import org.apache.lucene.analysis.StopwordAnalyzerBase;
  24 import org.apache.lucene.analysis.TokenStream;
  25 import org.apache.lucene.analysis.WordlistLoader;
  26 import org.apache.lucene.util.IOUtils;
  27 import org.apache.lucene.util.Version;
  28
  29 import java.io.File;
  30 import java.io.IOException;
  31 import java.io.Reader;
  32 import java.util.Set;
  33
  34 /**
  35  * Filters {@link ClassicTokenizer} with {@link ClassicFilter}, {@link
  36  * LowerCaseFilter} and {@link StopFilter}, using a list of
  37  * English stop words.
  38  *
  39  * <a name="version"/>
  40  * <p>You must specify the required {@link Version}
  41  * compatibility when creating ClassicAnalyzer:
  42  * <ul>
  43  *   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
  44  *         supplementary characters in stopwords
  45  *   <li> As of 2.9, StopFilter preserves position
  46  *        increments
  47  *   <li> As of 2.4, Tokens incorrectly identified as acronyms
  48  *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
  49  * </ul>
  50  *
  51  * ClassicAnalyzer was named StandardAnalyzer in Lucene versions prior to 3.1.
  52  * As of 3.1, {@link StandardAnalyzer} implements Unicode text segmentation,
  53  * as specified by UAX#29.
  54  */
  55 public final class ClassicAnalyzer extends StopwordAnalyzerBase {
  56
  57   /** Default maximum allowed token length */
  58   public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
  59
  60   private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
  61
  62   /**
  63    * Specifies whether deprecated acronyms should be replaced with HOST type.
  64    * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}
  65    */
  66   private final boolean replaceInvalidAcronym;
  67
  68   /** An unmodifiable set containing some common English words that are usually not
  69   useful for searching. */
  70   public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
  71
  72   /** Builds an analyzer with the given stop words.
  73    * @param matchVersion Lucene version to match See {@link
  74    * <a href="#version">above</a>}
  75    * @param stopWords stop words */
  76   public ClassicAnalyzer(Version matchVersion, Set<?> stopWords) {
  77     super(matchVersion, stopWords);
  78     replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_24);
  79   }
  80
  81   /** Builds an analyzer with the default stop words ({@link
  82    * #STOP_WORDS_SET}).
  83    * @param matchVersion Lucene version to match See {@link
  84    * <a href="#version">above</a>}
  85    */
  86   public ClassicAnalyzer(Version matchVersion) {
  87     this(matchVersion, STOP_WORDS_SET);
  88   }
  89
  90   /** Builds an analyzer with the stop words from the given file.
  91    * @see WordlistLoader#getWordSet(Reader, Version)
  92    * @param matchVersion Lucene version to match See {@link
  93    * <a href="#version">above</a>}
  94    * @param stopwords File to read stop words from */
  95   public ClassicAnalyzer(Version matchVersion, File stopwords) throws IOException {
  96     this(matchVersion, WordlistLoader.getWordSet(IOUtils.getDecodingReader(stopwords,
  97         IOUtils.CHARSET_UTF_8), matchVersion));
  98   }
  99
 100   /** Builds an analyzer with the stop words from the given reader.
 101    * @see WordlistLoader#getWordSet(Reader, Version)
 102    * @param matchVersion Lucene version to match See {@link
 103    * <a href="#version">above</a>}
 104    * @param stopwords Reader to read stop words from */
 105   public ClassicAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
 106     this(matchVersion, WordlistLoader.getWordSet(stopwords, matchVersion));
 107   }
 108
 109   /**
 110    * Set maximum allowed token length.  If a token is seen
 111    * that exceeds this length then it is discarded.  This
 112    * setting only takes effect the next time tokenStream or
 113    * reusableTokenStream is called.
 114    */
 115   public void setMaxTokenLength(int length) {
 116     maxTokenLength = length;
 117   }
 118
 119   /**
 120    * @see #setMaxTokenLength
 121    */
 122   public int getMaxTokenLength() {
 123     return maxTokenLength;
 124   }
 125
 126   @Override
 127   protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
 128     final ClassicTokenizer src = new ClassicTokenizer(matchVersion, reader);
 129     src.setMaxTokenLength(maxTokenLength);
 130     src.setReplaceInvalidAcronym(replaceInvalidAcronym);
 131     TokenStream tok = new ClassicFilter(src);
 132     tok = new LowerCaseFilter(matchVersion, tok);
 133     tok = new StopFilter(matchVersion, tok, stopwords);
 134     return new TokenStreamComponents(src, tok) {
 135       @Override
 136       protected boolean reset(final Reader reader) throws IOException {
 137         src.setMaxTokenLength(ClassicAnalyzer.this.maxTokenLength);
 138         return super.reset(reader);
 139       }
 140     };
 141   }
 142 }