lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java

   1 package org.apache.lucene.analysis.snowball;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.analysis.*;
  21 import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
  22 import org.apache.lucene.analysis.standard.*;
  23 import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
  24 import org.apache.lucene.util.Version;
  25
  26 import java.io.IOException;
  27 import java.io.Reader;
  28 import java.util.Set;
  29
  30 /** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
  31  * LowerCaseFilter}, {@link StopFilter} and {@link SnowballFilter}.
  32  *
  33  * Available stemmers are listed in org.tartarus.snowball.ext.  The name of a
  34  * stemmer is the part of the class name before "Stemmer", e.g., the stemmer in
  35  * {@link org.tartarus.snowball.ext.EnglishStemmer} is named "English".
  36  *
  37  * <p><b>NOTE</b>: This class uses the same {@link Version}
  38  * dependent settings as {@link StandardAnalyzer}, with the following addition:
  39  * <ul>
  40  *   <li> As of 3.1, uses {@link TurkishLowerCaseFilter} for Turkish language.
  41  * </ul>
  42  * </p>
  43  * @deprecated Use the language-specific analyzer in contrib/analyzers instead.
  44  * This analyzer will be removed in Lucene 5.0
  45  */
  46 @Deprecated
  47 public final class SnowballAnalyzer extends Analyzer {
  48   private String name;
  49   private Set<?> stopSet;
  50   private final Version matchVersion;
  51
  52   /** Builds the named analyzer with no stop words. */
  53   public SnowballAnalyzer(Version matchVersion, String name) {
  54     this.name = name;
  55     this.matchVersion = matchVersion;
  56   }
  57
  58   /**
  59    * Builds the named analyzer with the given stop words.
  60    * @deprecated Use {@link #SnowballAnalyzer(Version, String, Set)} instead.
  61    */
  62   @Deprecated
  63   public SnowballAnalyzer(Version matchVersion, String name, String[] stopWords) {
  64     this(matchVersion, name);
  65     stopSet = StopFilter.makeStopSet(matchVersion, stopWords);
  66   }
  67
  68   /** Builds the named analyzer with the given stop words. */
  69   public SnowballAnalyzer(Version matchVersion, String name, Set<?> stopWords) {
  70     this(matchVersion, name);
  71     stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
  72         stopWords));
  73   }
  74
  75   /** Constructs a {@link StandardTokenizer} filtered by a {@link
  76       StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter},
  77       and a {@link SnowballFilter} */
  78   @Override
  79   public TokenStream tokenStream(String fieldName, Reader reader) {
  80     TokenStream result = new StandardTokenizer(matchVersion, reader);
  81     result = new StandardFilter(matchVersion, result);
  82     // remove the possessive 's for english stemmers
  83     if (matchVersion.onOrAfter(Version.LUCENE_31) &&
  84         (name.equals("English") || name.equals("Porter") || name.equals("Lovins")))
  85       result = new EnglishPossessiveFilter(result);
  86     // Use a special lowercase filter for turkish, the stemmer expects it.
  87     if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
  88       result = new TurkishLowerCaseFilter(result);
  89     else
  90       result = new LowerCaseFilter(matchVersion, result);
  91     if (stopSet != null)
  92       result = new StopFilter(matchVersion,
  93                               result, stopSet);
  94     result = new SnowballFilter(result, name);
  95     return result;
  96   }
  97
  98   private class SavedStreams {
  99     Tokenizer source;
 100     TokenStream result;
 101   }
 102
 103   /** Returns a (possibly reused) {@link StandardTokenizer} filtered by a
 104    * {@link StandardFilter}, a {@link LowerCaseFilter},
 105    * a {@link StopFilter}, and a {@link SnowballFilter} */
 106   @Override
 107   public TokenStream reusableTokenStream(String fieldName, Reader reader)
 108       throws IOException {
 109     SavedStreams streams = (SavedStreams) getPreviousTokenStream();
 110     if (streams == null) {
 111       streams = new SavedStreams();
 112       streams.source = new StandardTokenizer(matchVersion, reader);
 113       streams.result = new StandardFilter(matchVersion, streams.source);
 114       // Use a special lowercase filter for turkish, the stemmer expects it.
 115       if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
 116         streams.result = new TurkishLowerCaseFilter(streams.result);
 117       else
 118         streams.result = new LowerCaseFilter(matchVersion, streams.result);
 119       if (stopSet != null)
 120         streams.result = new StopFilter(matchVersion,
 121                                         streams.result, stopSet);
 122       streams.result = new SnowballFilter(streams.result, name);
 123       setPreviousTokenStream(streams);
 124     } else {
 125       streams.source.reset(reader);
 126     }
 127     return streams.result;
 128   }
 129 }