lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java

   1 package org.apache.lucene.analysis.de;
   2 // This file is encoded in UTF-8
   3
   4 /**
   5  * Licensed to the Apache Software Foundation (ASF) under one or more
   6  * contributor license agreements.  See the NOTICE file distributed with
   7  * this work for additional information regarding copyright ownership.
   8  * The ASF licenses this file to You under the Apache License, Version 2.0
   9  * (the "License"); you may not use this file except in compliance with
  10  * the License.  You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20
  21 import java.io.File;
  22 import java.io.IOException;
  23 import java.io.Reader;
  24 import java.util.Arrays;
  25 import java.util.HashSet;
  26 import java.util.Map;
  27 import java.util.Set;
  28
  29 import org.apache.lucene.analysis.Analyzer;
  30 import org.apache.lucene.analysis.CharArraySet;
  31 import org.apache.lucene.analysis.LowerCaseFilter;
  32 import org.apache.lucene.analysis.KeywordMarkerFilter;
  33 import org.apache.lucene.analysis.StopFilter;
  34 import org.apache.lucene.analysis.StopwordAnalyzerBase;
  35 import org.apache.lucene.analysis.TokenStream;
  36 import org.apache.lucene.analysis.Tokenizer;
  37 import org.apache.lucene.analysis.WordlistLoader;
  38 import org.apache.lucene.analysis.snowball.SnowballFilter;
  39 import org.apache.lucene.analysis.standard.StandardAnalyzer;
  40 import org.apache.lucene.analysis.standard.StandardFilter;
  41 import org.apache.lucene.analysis.standard.StandardTokenizer;
  42 import org.apache.lucene.util.Version;
  43 import org.tartarus.snowball.ext.German2Stemmer;
  44
  45 /**
  46  * {@link Analyzer} for German language.
  47  * <p>
  48  * Supports an external list of stopwords (words that
  49  * will not be indexed at all) and an external list of exclusions (word that will
  50  * not be stemmed, but indexed).
  51  * A default set of stopwords is used unless an alternative list is specified, but the
  52  * exclusion list is empty by default.
  53  * </p>
  54  *
  55  * <a name="version"/>
  56  * <p>You must specify the required {@link Version}
  57  * compatibility when creating GermanAnalyzer:
  58  * <ul>
  59  *   <li> As of 3.1, Snowball stemming is done with SnowballFilter, and
  60  *        Snowball stopwords are used by default.
  61  *   <li> As of 2.9, StopFilter preserves position
  62  *        increments
  63  * </ul>
  64  *
  65  * <p><b>NOTE</b>: This class uses the same {@link Version}
  66  * dependent settings as {@link StandardAnalyzer}.</p>
  67  */
  68 public final class GermanAnalyzer extends StopwordAnalyzerBase {
  69
  70   /**
  71    * List of typical german stopwords.
  72    * @deprecated use {@link #getDefaultStopSet()} instead
  73    */
  74   @Deprecated
  75   public final static String[] GERMAN_STOP_WORDS = {
  76     "einer", "eine", "eines", "einem", "einen",
  77     "der", "die", "das", "dass", "daß",
  78     "du", "er", "sie", "es",
  79     "was", "wer", "wie", "wir",
  80     "und", "oder", "ohne", "mit",
  81     "am", "im", "in", "aus", "auf",
  82     "ist", "sein", "war", "wird",
  83     "ihr", "ihre", "ihres",
  84     "als", "für", "von", "mit",
  85     "dich", "dir", "mich", "mir",
  86     "mein", "sein", "kein",
  87     "durch", "wegen", "wird"
  88   };
  89
  90   /** File containing default German stopwords. */
  91   public final static String DEFAULT_STOPWORD_FILE = "german_stop.txt";
  92
  93   /**
  94    * Returns a set of default German-stopwords
  95    * @return a set of default German-stopwords
  96    */
  97   public static final Set<?> getDefaultStopSet(){
  98     return DefaultSetHolder.DEFAULT_SET;
  99   }
 100
 101   private static class DefaultSetHolder {
 102     /** @deprecated remove in Lucene 5.0 */
 103     @Deprecated
 104     private static final Set<?> DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet(
 105         Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false));
 106     private static final Set<?> DEFAULT_SET;
 107     static {
 108       try {
 109         DEFAULT_SET =
 110           WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
 111       } catch (IOException ex) {
 112         // default set should always be present as it is part of the
 113         // distribution (JAR)
 114         throw new RuntimeException("Unable to load default stopword set");
 115       }
 116     }
 117   }
 118
 119   /**
 120    * Contains the stopwords used with the {@link StopFilter}.
 121    */
 122
 123   /**
 124    * Contains words that should be indexed but not stemmed.
 125    */
 126   // TODO make this final in 3.1
 127   private Set<?> exclusionSet;
 128
 129   /**
 130    * Builds an analyzer with the default stop words:
 131    * {@link #getDefaultStopSet()}.
 132    */
 133   public GermanAnalyzer(Version matchVersion) {
 134     this(matchVersion,
 135         matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_SET
 136             : DefaultSetHolder.DEFAULT_SET_30);
 137   }
 138
 139   /**
 140    * Builds an analyzer with the given stop words
 141    *
 142    * @param matchVersion
 143    *          lucene compatibility version
 144    * @param stopwords
 145    *          a stopword set
 146    */
 147   public GermanAnalyzer(Version matchVersion, Set<?> stopwords) {
 148     this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
 149   }
 150
 151   /**
 152    * Builds an analyzer with the given stop words
 153    *
 154    * @param matchVersion
 155    *          lucene compatibility version
 156    * @param stopwords
 157    *          a stopword set
 158    * @param stemExclusionSet
 159    *          a stemming exclusion set
 160    */
 161   public GermanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
 162     super(matchVersion, stopwords);
 163     exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
 164   }
 165
 166   /**
 167    * Builds an analyzer with the given stop words.
 168    * @deprecated use {@link #GermanAnalyzer(Version, Set)}
 169    */
 170   @Deprecated
 171   public GermanAnalyzer(Version matchVersion, String... stopwords) {
 172     this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
 173   }
 174
 175   /**
 176    * Builds an analyzer with the given stop words.
 177    * @deprecated use {@link #GermanAnalyzer(Version, Set)}
 178    */
 179   @Deprecated
 180   public GermanAnalyzer(Version matchVersion, Map<?,?> stopwords) {
 181     this(matchVersion, stopwords.keySet());
 182
 183   }
 184
 185   /**
 186    * Builds an analyzer with the given stop words.
 187    * @deprecated use {@link #GermanAnalyzer(Version, Set)}
 188    */
 189   @Deprecated
 190   public GermanAnalyzer(Version matchVersion, File stopwords) throws IOException {
 191     this(matchVersion, WordlistLoader.getWordSet(stopwords));
 192   }
 193
 194   /**
 195    * Builds an exclusionlist from an array of Strings.
 196    * @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
 197    */
 198   @Deprecated
 199   public void setStemExclusionTable(String[] exclusionlist) {
 200     exclusionSet = StopFilter.makeStopSet(matchVersion, exclusionlist);
 201     setPreviousTokenStream(null); // force a new stemmer to be created
 202   }
 203
 204   /**
 205    * Builds an exclusionlist from a {@link Map}
 206    * @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
 207    */
 208   @Deprecated
 209   public void setStemExclusionTable(Map<?,?> exclusionlist) {
 210     exclusionSet = new HashSet<Object>(exclusionlist.keySet());
 211     setPreviousTokenStream(null); // force a new stemmer to be created
 212   }
 213
 214   /**
 215    * Builds an exclusionlist from the words contained in the given file.
 216    * @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
 217    */
 218   @Deprecated
 219   public void setStemExclusionTable(File exclusionlist) throws IOException {
 220     exclusionSet = WordlistLoader.getWordSet(exclusionlist);
 221     setPreviousTokenStream(null); // force a new stemmer to be created
 222   }
 223
 224   /**
 225    * Creates
 226    * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 227    * used to tokenize all the text in the provided {@link Reader}.
 228    *
 229    * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 230    *         built from a {@link StandardTokenizer} filtered with
 231    *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
 232    *         , {@link KeywordMarkerFilter} if a stem exclusion set is
 233    *         provided, and {@link SnowballFilter}
 234    */
 235   @Override
 236   protected TokenStreamComponents createComponents(String fieldName,
 237       Reader reader) {
 238     final Tokenizer source = new StandardTokenizer(matchVersion, reader);
 239     TokenStream result = new StandardFilter(matchVersion, source);
 240     result = new LowerCaseFilter(matchVersion, result);
 241     result = new StopFilter( matchVersion, result, stopwords);
 242     result = new KeywordMarkerFilter(result, exclusionSet);
 243     if (matchVersion.onOrAfter(Version.LUCENE_31))
 244       result = new SnowballFilter(result, new German2Stemmer());
 245     else
 246       result = new GermanStemFilter(result);
 247     return new TokenStreamComponents(source, result);
 248   }
 249 }