lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java

   1 package org.apache.lucene.analysis.de;
   2 // This file is encoded in UTF-8
   3
   4 /**
   5  * Licensed to the Apache Software Foundation (ASF) under one or more
   6  * contributor license agreements.  See the NOTICE file distributed with
   7  * this work for additional information regarding copyright ownership.
   8  * The ASF licenses this file to You under the Apache License, Version 2.0
   9  * (the "License"); you may not use this file except in compliance with
  10  * the License.  You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20
  21 import java.io.File;
  22 import java.io.IOException;
  23 import java.io.Reader;
  24 import java.util.Arrays;
  25 import java.util.HashSet;
  26 import java.util.Map;
  27 import java.util.Set;
  28
  29 import org.apache.lucene.analysis.Analyzer;
  30 import org.apache.lucene.analysis.CharArraySet;
  31 import org.apache.lucene.analysis.LowerCaseFilter;
  32 import org.apache.lucene.analysis.KeywordMarkerFilter;
  33 import org.apache.lucene.analysis.StopFilter;
  34 import org.apache.lucene.analysis.StopwordAnalyzerBase;
  35 import org.apache.lucene.analysis.TokenStream;
  36 import org.apache.lucene.analysis.Tokenizer;
  37 import org.apache.lucene.analysis.WordlistLoader;
  38 import org.apache.lucene.analysis.snowball.SnowballFilter;
  39 import org.apache.lucene.analysis.standard.StandardAnalyzer;
  40 import org.apache.lucene.analysis.standard.StandardFilter;
  41 import org.apache.lucene.analysis.standard.StandardTokenizer;
  42 import org.apache.lucene.util.IOUtils;
  43 import org.apache.lucene.util.Version;
  44 import org.tartarus.snowball.ext.German2Stemmer;
  45
  46 /**
  47  * {@link Analyzer} for German language.
  48  * <p>
  49  * Supports an external list of stopwords (words that
  50  * will not be indexed at all) and an external list of exclusions (word that will
  51  * not be stemmed, but indexed).
  52  * A default set of stopwords is used unless an alternative list is specified, but the
  53  * exclusion list is empty by default.
  54  * </p>
  55  *
  56  * <a name="version"/>
  57  * <p>You must specify the required {@link Version}
  58  * compatibility when creating GermanAnalyzer:
  59  * <ul>
  60  *   <li> As of 3.1, Snowball stemming is done with SnowballFilter, and
  61  *        Snowball stopwords are used by default.
  62  *   <li> As of 2.9, StopFilter preserves position
  63  *        increments
  64  * </ul>
  65  *
  66  * <p><b>NOTE</b>: This class uses the same {@link Version}
  67  * dependent settings as {@link StandardAnalyzer}.</p>
  68  */
  69 public final class GermanAnalyzer extends StopwordAnalyzerBase {
  70
  71   /**
  72    * List of typical german stopwords.
  73    * @deprecated use {@link #getDefaultStopSet()} instead
  74    */
  75   @Deprecated
  76   public final static String[] GERMAN_STOP_WORDS = {
  77     "einer", "eine", "eines", "einem", "einen",
  78     "der", "die", "das", "dass", "daß",
  79     "du", "er", "sie", "es",
  80     "was", "wer", "wie", "wir",
  81     "und", "oder", "ohne", "mit",
  82     "am", "im", "in", "aus", "auf",
  83     "ist", "sein", "war", "wird",
  84     "ihr", "ihre", "ihres",
  85     "als", "für", "von", "mit",
  86     "dich", "dir", "mich", "mir",
  87     "mein", "sein", "kein",
  88     "durch", "wegen", "wird"
  89   };
  90
  91   /** File containing default German stopwords. */
  92   public final static String DEFAULT_STOPWORD_FILE = "german_stop.txt";
  93
  94   /**
  95    * Returns a set of default German-stopwords
  96    * @return a set of default German-stopwords
  97    */
  98   public static final Set<?> getDefaultStopSet(){
  99     return DefaultSetHolder.DEFAULT_SET;
 100   }
 101
 102   private static class DefaultSetHolder {
 103     /** @deprecated remove in Lucene 5.0 */
 104     @Deprecated
 105     private static final Set<?> DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet(
 106         Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false));
 107     private static final Set<?> DEFAULT_SET;
 108     static {
 109       try {
 110         DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
 111             DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
 112       } catch (IOException ex) {
 113         // default set should always be present as it is part of the
 114         // distribution (JAR)
 115         throw new RuntimeException("Unable to load default stopword set");
 116       }
 117     }
 118   }
 119
 120   /**
 121    * Contains the stopwords used with the {@link StopFilter}.
 122    */
 123
 124   /**
 125    * Contains words that should be indexed but not stemmed.
 126    */
 127   // TODO make this final in 3.1
 128   private Set<?> exclusionSet;
 129
 130   /**
 131    * Builds an analyzer with the default stop words:
 132    * {@link #getDefaultStopSet()}.
 133    */
 134   public GermanAnalyzer(Version matchVersion) {
 135     this(matchVersion,
 136         matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_SET
 137             : DefaultSetHolder.DEFAULT_SET_30);
 138   }
 139
 140   /**
 141    * Builds an analyzer with the given stop words
 142    *
 143    * @param matchVersion
 144    *          lucene compatibility version
 145    * @param stopwords
 146    *          a stopword set
 147    */
 148   public GermanAnalyzer(Version matchVersion, Set<?> stopwords) {
 149     this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
 150   }
 151
 152   /**
 153    * Builds an analyzer with the given stop words
 154    *
 155    * @param matchVersion
 156    *          lucene compatibility version
 157    * @param stopwords
 158    *          a stopword set
 159    * @param stemExclusionSet
 160    *          a stemming exclusion set
 161    */
 162   public GermanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
 163     super(matchVersion, stopwords);
 164     exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
 165   }
 166
 167   /**
 168    * Builds an analyzer with the given stop words.
 169    * @deprecated use {@link #GermanAnalyzer(Version, Set)}
 170    */
 171   @Deprecated
 172   public GermanAnalyzer(Version matchVersion, String... stopwords) {
 173     this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
 174   }
 175
 176   /**
 177    * Builds an analyzer with the given stop words.
 178    * @deprecated use {@link #GermanAnalyzer(Version, Set)}
 179    */
 180   @Deprecated
 181   public GermanAnalyzer(Version matchVersion, Map<?,?> stopwords) {
 182     this(matchVersion, stopwords.keySet());
 183
 184   }
 185
 186   /**
 187    * Builds an analyzer with the given stop words.
 188    * @deprecated use {@link #GermanAnalyzer(Version, Set)}
 189    */
 190   @Deprecated
 191   public GermanAnalyzer(Version matchVersion, File stopwords) throws IOException {
 192     this(matchVersion, WordlistLoader.getWordSet(
 193         IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8), matchVersion));
 194   }
 195
 196   /**
 197    * Builds an exclusionlist from an array of Strings.
 198    * @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
 199    */
 200   @Deprecated
 201   public void setStemExclusionTable(String[] exclusionlist) {
 202     exclusionSet = StopFilter.makeStopSet(matchVersion, exclusionlist);
 203     setPreviousTokenStream(null); // force a new stemmer to be created
 204   }
 205
 206   /**
 207    * Builds an exclusionlist from a {@link Map}
 208    * @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
 209    */
 210   @Deprecated
 211   public void setStemExclusionTable(Map<?,?> exclusionlist) {
 212     exclusionSet = new HashSet<Object>(exclusionlist.keySet());
 213     setPreviousTokenStream(null); // force a new stemmer to be created
 214   }
 215
 216   /**
 217    * Builds an exclusionlist from the words contained in the given file.
 218    * @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
 219    */
 220   @Deprecated
 221   public void setStemExclusionTable(File exclusionlist) throws IOException {
 222     exclusionSet = WordlistLoader.getWordSet(IOUtils.getDecodingReader(exclusionlist,
 223         IOUtils.CHARSET_UTF_8), matchVersion);
 224     setPreviousTokenStream(null); // force a new stemmer to be created
 225   }
 226
 227   /**
 228    * Creates
 229    * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 230    * used to tokenize all the text in the provided {@link Reader}.
 231    *
 232    * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 233    *         built from a {@link StandardTokenizer} filtered with
 234    *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
 235    *         , {@link KeywordMarkerFilter} if a stem exclusion set is
 236    *         provided, and {@link SnowballFilter}
 237    */
 238   @Override
 239   protected TokenStreamComponents createComponents(String fieldName,
 240       Reader reader) {
 241     final Tokenizer source = new StandardTokenizer(matchVersion, reader);
 242     TokenStream result = new StandardFilter(matchVersion, source);
 243     result = new LowerCaseFilter(matchVersion, result);
 244     result = new StopFilter( matchVersion, result, stopwords);
 245     result = new KeywordMarkerFilter(result, exclusionSet);
 246     if (matchVersion.onOrAfter(Version.LUCENE_31))
 247       result = new SnowballFilter(result, new German2Stemmer());
 248     else
 249       result = new GermanStemFilter(result);
 250     return new TokenStreamComponents(source, result);
 251   }
 252 }