lucene-java-3.5.0/lucene/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java

   1 package org.apache.lucene.analysis.standard;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.analysis.*;
  21 import org.apache.lucene.util.IOUtils;
  22 import org.apache.lucene.util.Version;
  23
  24 import java.io.File;
  25 import java.io.IOException;
  26 import java.io.Reader;
  27 import java.util.Set;
  28
  29 /**
  30  * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
  31  * LowerCaseFilter} and {@link StopFilter}, using a list of
  32  * English stop words.
  33  *
  34  * <a name="version"/>
  35  * <p>You must specify the required {@link Version}
  36  * compatibility when creating StandardAnalyzer:
  37  * <ul>
  38  *   <li> As of 3.4, Hiragana and Han characters are no longer wrongly split
  39  *        from their combining characters. If you use a previous version number,
  40  *        you get the exact broken behavior for backwards compatibility.
  41  *   <li> As of 3.1, StandardTokenizer implements Unicode text segmentation,
  42  *        and StopFilter correctly handles Unicode 4.0 supplementary characters
  43  *        in stopwords.  {@link ClassicTokenizer} and {@link ClassicAnalyzer}
  44  *        are the pre-3.1 implementations of StandardTokenizer and
  45  *        StandardAnalyzer.
  46  *   <li> As of 2.9, StopFilter preserves position increments
  47  *   <li> As of 2.4, Tokens incorrectly identified as acronyms
  48  *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
  49  * </ul>
  50  */
  51 public final class StandardAnalyzer extends StopwordAnalyzerBase {
  52
  53   /** Default maximum allowed token length */
  54   public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
  55
  56   private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
  57
  58   /**
  59    * Specifies whether deprecated acronyms should be replaced with HOST type.
  60    * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}
  61    */
  62   private final boolean replaceInvalidAcronym;
  63
  64   /** An unmodifiable set containing some common English words that are usually not
  65   useful for searching. */
  66   public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
  67
  68   /** Builds an analyzer with the given stop words.
  69    * @param matchVersion Lucene version to match See {@link
  70    * <a href="#version">above</a>}
  71    * @param stopWords stop words */
  72   public StandardAnalyzer(Version matchVersion, Set<?> stopWords) {
  73     super(matchVersion, stopWords);
  74     replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_24);
  75   }
  76
  77   /** Builds an analyzer with the default stop words ({@link
  78    * #STOP_WORDS_SET}).
  79    * @param matchVersion Lucene version to match See {@link
  80    * <a href="#version">above</a>}
  81    */
  82   public StandardAnalyzer(Version matchVersion) {
  83     this(matchVersion, STOP_WORDS_SET);
  84   }
  85
  86   /** Builds an analyzer with the stop words from the given file.
  87    * @see WordlistLoader#getWordSet(Reader, Version)
  88    * @param matchVersion Lucene version to match See {@link
  89    * <a href="#version">above</a>}
  90    * @param stopwords File to read stop words from */
  91   public StandardAnalyzer(Version matchVersion, File stopwords) throws IOException {
  92     this(matchVersion, WordlistLoader.getWordSet(IOUtils.getDecodingReader(stopwords,
  93         IOUtils.CHARSET_UTF_8), matchVersion));
  94   }
  95
  96   /** Builds an analyzer with the stop words from the given reader.
  97    * @see WordlistLoader#getWordSet(Reader, Version)
  98    * @param matchVersion Lucene version to match See {@link
  99    * <a href="#version">above</a>}
 100    * @param stopwords Reader to read stop words from */
 101   public StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
 102     this(matchVersion, WordlistLoader.getWordSet(stopwords, matchVersion));
 103   }
 104
 105   /**
 106    * Set maximum allowed token length.  If a token is seen
 107    * that exceeds this length then it is discarded.  This
 108    * setting only takes effect the next time tokenStream or
 109    * reusableTokenStream is called.
 110    */
 111   public void setMaxTokenLength(int length) {
 112     maxTokenLength = length;
 113   }
 114
 115   /**
 116    * @see #setMaxTokenLength
 117    */
 118   public int getMaxTokenLength() {
 119     return maxTokenLength;
 120   }
 121
 122   @Override
 123   protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
 124     final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
 125     src.setMaxTokenLength(maxTokenLength);
 126     src.setReplaceInvalidAcronym(replaceInvalidAcronym);
 127     TokenStream tok = new StandardFilter(matchVersion, src);
 128     tok = new LowerCaseFilter(matchVersion, tok);
 129     tok = new StopFilter(matchVersion, tok, stopwords);
 130     return new TokenStreamComponents(src, tok) {
 131       @Override
 132       protected boolean reset(final Reader reader) throws IOException {
 133         src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
 134         return super.reset(reader);
 135       }
 136     };
 137   }
 138 }