1 package org.apache.lucene.analysis.nl;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import org.apache.lucene.analysis.Analyzer;
21 import org.apache.lucene.analysis.CharArrayMap;
22 import org.apache.lucene.analysis.CharArraySet;
23 import org.apache.lucene.analysis.KeywordMarkerFilter;
24 import org.apache.lucene.analysis.LowerCaseFilter;
25 import org.apache.lucene.analysis.ReusableAnalyzerBase;
26 import org.apache.lucene.analysis.StopFilter;
27 import org.apache.lucene.analysis.TokenStream;
28 import org.apache.lucene.analysis.Tokenizer;
29 import org.apache.lucene.analysis.WordlistLoader;
30 import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
31 import org.apache.lucene.analysis.snowball.SnowballFilter;
32 import org.apache.lucene.analysis.standard.StandardFilter;
33 import org.apache.lucene.analysis.standard.StandardTokenizer;
34 import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
35 import org.apache.lucene.util.IOUtils;
36 import org.apache.lucene.util.Version;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
48 * {@link Analyzer} for Dutch language.
50 * Supports an external list of stopwords (words that
51 * will not be indexed at all), an external list of exclusions (words that will
52 * not be stemmed, but indexed) and an external list of word-stem pairs that overrule
53 * the algorithm (dictionary stemming).
54 * A default set of stopwords is used unless an alternative list is specified, but the
55 * exclusion list is empty by default.
59 * <p>You must specify the required {@link Version}
60 * compatibility when creating DutchAnalyzer:
62 * <li> As of 3.1, Snowball stemming is done with SnowballFilter,
63 * LowerCaseFilter is used prior to StopFilter, and Snowball
64 * stopwords are used by default.
65 * <li> As of 2.9, StopFilter preserves position
69 * <p><b>NOTE</b>: This class uses the same {@link Version}
70 * dependent settings as {@link StandardAnalyzer}.</p>
72 public final class DutchAnalyzer extends ReusableAnalyzerBase {
74 * List of typical Dutch stopwords.
75 * @deprecated use {@link #getDefaultStopSet()} instead
78 public final static String[] DUTCH_STOP_WORDS;
80 /** File containing default Dutch stopwords. */
81 public final static String DEFAULT_STOPWORD_FILE = "dutch_stop.txt";
84 Set<?> defaultStopSet = getDefaultStopSet();
85 DUTCH_STOP_WORDS = new String[defaultStopSet.size()];
87 for (Object object: defaultStopSet) {
88 DUTCH_STOP_WORDS[i++] = new String((char[])object);
93 * Returns an unmodifiable instance of the default stop-words set.
94 * @return an unmodifiable instance of the default stop-words set.
96 public static Set<?> getDefaultStopSet(){
97 return DefaultSetHolder.DEFAULT_STOP_SET;
100 private static class DefaultSetHolder {
101 static final Set<?> DEFAULT_STOP_SET;
105 DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
106 DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
107 } catch (IOException ex) {
108 // default set should always be present as it is part of the
109 // distribution (JAR)
110 throw new RuntimeException("Unable to load default stopword set");
  /**
   * Contains the stopwords used with the StopFilter.
   */
  private final Set<?> stoptable;

  /**
   * Contains words that should be indexed but not stemmed.
   */
  private Set<?> excltable = Collections.emptySet();

  // Per-word stem overrides applied before the Snowball stemmer
  // (dictionary stemming); empty by default.
  private Map<Object,String> stemdict = CharArrayMap.emptyMap();
  // Version compatibility setting; controls the analysis chain in createComponents.
  private final Version matchVersion;
131 * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()})
132 * and a few default entries for the stem exclusion table.
135 public DutchAnalyzer(Version matchVersion) {
136 this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
137 stemdict = new CharArrayMap<String>(matchVersion, 16, false);
138 stemdict.put("fiets", "fiets"); //otherwise fiet
139 stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
140 stemdict.put("ei", "eier");
141 stemdict.put("kind", "kinder");
144 public DutchAnalyzer(Version matchVersion, Set<?> stopwords){
145 this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
148 public DutchAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionTable){
149 stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
150 excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
151 this.matchVersion = matchVersion;
155 * Builds an analyzer with the given stop words.
157 * @param matchVersion
159 * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
162 public DutchAnalyzer(Version matchVersion, String... stopwords) {
163 this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
167 * Builds an analyzer with the given stop words.
170 * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
173 public DutchAnalyzer(Version matchVersion, HashSet<?> stopwords) {
174 this(matchVersion, (Set<?>)stopwords);
178 * Builds an analyzer with the given stop words.
181 * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
184 public DutchAnalyzer(Version matchVersion, File stopwords) {
185 // this is completely broken!
187 stoptable = WordlistLoader.getWordSet(IOUtils.getDecodingReader(stopwords,
188 IOUtils.CHARSET_UTF_8), matchVersion);
189 } catch (IOException e) {
190 // TODO: throw IOException
191 throw new RuntimeException(e);
193 this.matchVersion = matchVersion;
197 * Builds an exclusionlist from an array of Strings.
199 * @param exclusionlist
200 * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
203 public void setStemExclusionTable(String... exclusionlist) {
204 excltable = StopFilter.makeStopSet(matchVersion, exclusionlist);
205 setPreviousTokenStream(null); // force a new stemmer to be created
209 * Builds an exclusionlist from a Hashtable.
210 * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
213 public void setStemExclusionTable(HashSet<?> exclusionlist) {
214 excltable = exclusionlist;
215 setPreviousTokenStream(null); // force a new stemmer to be created
219 * Builds an exclusionlist from the words contained in the given file.
220 * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
223 public void setStemExclusionTable(File exclusionlist) {
226 excltable = WordlistLoader.getWordSet(IOUtils.getDecodingReader(exclusionlist,
227 IOUtils.CHARSET_UTF_8), matchVersion);
228 setPreviousTokenStream(null); // force a new stemmer to be created
229 } catch (IOException e) {
230 // TODO: throw IOException
231 throw new RuntimeException(e);
236 * Reads a stemdictionary file , that overrules the stemming algorithm
237 * This is a textfile that contains per line
238 * <tt>word<b>\t</b>stem</tt>, i.e: two tab seperated words
239 * @deprecated This prevents reuse of TokenStreams. If you wish to use a custom
240 * stem dictionary, create your own Analyzer with {@link StemmerOverrideFilter}
243 public void setStemDictionary(File stemdictFile) {
245 stemdict = WordlistLoader.getStemDict(IOUtils.getDecodingReader(stemdictFile,
246 IOUtils.CHARSET_UTF_8), new CharArrayMap<String>(matchVersion, 16, false));
247 setPreviousTokenStream(null); // force a new stemmer to be created
248 } catch (IOException e) {
249 // TODO: throw IOException
250 throw new RuntimeException(e);
256 * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
257 * text in the provided {@link Reader}.
259 * @return A {@link TokenStream} built from a {@link StandardTokenizer}
260 * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
261 * {@link StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion set is provided,
262 * {@link StemmerOverrideFilter}, and {@link SnowballFilter}
265 protected TokenStreamComponents createComponents(String fieldName,
267 if (matchVersion.onOrAfter(Version.LUCENE_31)) {
268 final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
269 TokenStream result = new StandardFilter(matchVersion, source);
270 result = new LowerCaseFilter(matchVersion, result);
271 result = new StopFilter(matchVersion, result, stoptable);
272 if (!excltable.isEmpty())
273 result = new KeywordMarkerFilter(result, excltable);
274 if (!stemdict.isEmpty())
275 result = new StemmerOverrideFilter(matchVersion, result, stemdict);
276 result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
277 return new TokenStreamComponents(source, result);
279 final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
280 TokenStream result = new StandardFilter(matchVersion, source);
281 result = new StopFilter(matchVersion, result, stoptable);
282 if (!excltable.isEmpty())
283 result = new KeywordMarkerFilter(result, excltable);
284 result = new DutchStemFilter(result, stemdict);
285 return new TokenStreamComponents(source, result);