1 package org.apache.lucene.analysis.nl;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import org.apache.lucene.analysis.Analyzer;
21 import org.apache.lucene.analysis.CharArrayMap;
22 import org.apache.lucene.analysis.CharArraySet;
23 import org.apache.lucene.analysis.KeywordMarkerFilter;
24 import org.apache.lucene.analysis.LowerCaseFilter;
25 import org.apache.lucene.analysis.ReusableAnalyzerBase;
26 import org.apache.lucene.analysis.StopFilter;
27 import org.apache.lucene.analysis.TokenStream;
28 import org.apache.lucene.analysis.Tokenizer;
29 import org.apache.lucene.analysis.WordlistLoader;
30 import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
31 import org.apache.lucene.analysis.snowball.SnowballFilter;
32 import org.apache.lucene.analysis.standard.StandardFilter;
33 import org.apache.lucene.analysis.standard.StandardTokenizer;
34 import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
35 import org.apache.lucene.util.IOUtils;
36 import org.apache.lucene.util.Version;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
48 * {@link Analyzer} for Dutch language.
50 * Supports an external list of stopwords (words that
51 * will not be indexed at all), an external list of exclusions (words that will
52 * not be stemmed, but indexed) and an external list of word-stem pairs that overrule
53 * the algorithm (dictionary stemming).
54 * A default set of stopwords is used unless an alternative list is specified, but the
55 * exclusion list is empty by default.
59 * <p>You must specify the required {@link Version}
60 * compatibility when creating DutchAnalyzer:
62 * <li> As of 3.1, Snowball stemming is done with SnowballFilter,
63 * LowerCaseFilter is used prior to StopFilter, and Snowball
64 * stopwords are used by default.
65 * <li> As of 2.9, StopFilter preserves position
69 * <p><b>NOTE</b>: This class uses the same {@link Version}
70 * dependent settings as {@link StandardAnalyzer}.</p>
72 public final class DutchAnalyzer extends ReusableAnalyzerBase {
74 * List of typical Dutch stopwords.
75 * @deprecated use {@link #getDefaultStopSet()} instead
78 public final static String[] DUTCH_STOP_WORDS;
80 /** File containing default Dutch stopwords. */
81 public final static String DEFAULT_STOPWORD_FILE = "dutch_stop.txt";
84 Set<?> defaultStopSet = getDefaultStopSet();
85 DUTCH_STOP_WORDS = new String[defaultStopSet.size()];
87 for (Object object: defaultStopSet) {
88 DUTCH_STOP_WORDS[i++] = new String((char[])object);
93 * Returns an unmodifiable instance of the default stop-words set.
94 * @return an unmodifiable instance of the default stop-words set.
96 public static Set<?> getDefaultStopSet(){
97 return DefaultSetHolder.DEFAULT_STOP_SET;
100 private static class DefaultSetHolder {
101 static final Set<?> DEFAULT_STOP_SET;
105 DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
106 DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
107 } catch (IOException ex) {
108 // default set should always be present as it is part of the
109 // distribution (JAR)
110 throw new RuntimeException("Unable to load default stopword set");
  /**
   * Contains the stopwords used with the StopFilter.
   */
  private final Set<?> stoptable;

  /**
   * Contains words that should be indexed but not stemmed.
   */
  private Set<?> excltable = Collections.emptySet();

  // Per-word stem overrides applied before the Snowball stemmer
  // (dictionary stemming); empty by default.
  private Map<Object,String> stemdict = CharArrayMap.emptyMap();
  // Version compatibility setting; controls the analysis chain in createComponents.
  private final Version matchVersion;
131 * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()})
132 * and a few default entries for the stem exclusion table.
135 public DutchAnalyzer(Version matchVersion) {
136 this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
137 stemdict = new CharArrayMap<String>(matchVersion, 16, false);
138 stemdict.put("fiets", "fiets"); //otherwise fiet
139 stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
140 stemdict.put("ei", "eier");
141 stemdict.put("kind", "kinder");
144 public DutchAnalyzer(Version matchVersion, Set<?> stopwords){
145 this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
148 public DutchAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionTable){
149 stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
150 excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
151 this.matchVersion = matchVersion;
155 * Builds an analyzer with the given stop words.
157 * @param matchVersion
159 * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
162 public DutchAnalyzer(Version matchVersion, String... stopwords) {
163 this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
167 * Builds an analyzer with the given stop words.
170 * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
173 public DutchAnalyzer(Version matchVersion, HashSet<?> stopwords) {
174 this(matchVersion, (Set<?>)stopwords);
178 * Builds an analyzer with the given stop words.
181 * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
184 public DutchAnalyzer(Version matchVersion, File stopwords) {
185 // this is completely broken!
187 stoptable = WordlistLoader.getWordSet(IOUtils.getDecodingReader(stopwords,
188 IOUtils.CHARSET_UTF_8), matchVersion);
189 } catch (IOException e) {
190 // TODO: throw IOException
191 throw new RuntimeException(e);
193 this.matchVersion = matchVersion;
197 * Builds an exclusionlist from an array of Strings.
199 * @param exclusionlist
200 * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
203 public void setStemExclusionTable(String... exclusionlist) {
204 excltable = StopFilter.makeStopSet(matchVersion, exclusionlist);
205 setPreviousTokenStream(null); // force a new stemmer to be created
209 * Builds an exclusionlist from a Hashtable.
210 * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
213 public void setStemExclusionTable(HashSet<?> exclusionlist) {
214 excltable = exclusionlist;
215 setPreviousTokenStream(null); // force a new stemmer to be created
219 * Builds an exclusionlist from the words contained in the given file.
220 * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
223 public void setStemExclusionTable(File exclusionlist) {
226 excltable = WordlistLoader.getWordSet(IOUtils.getDecodingReader(exclusionlist,
227 IOUtils.CHARSET_UTF_8), matchVersion);
228 setPreviousTokenStream(null); // force a new stemmer to be created
229 } catch (IOException e) {
230 // TODO: throw IOException
231 throw new RuntimeException(e);
236 * Reads a stemdictionary file , that overrules the stemming algorithm
237 * This is a textfile that contains per line
238 * <tt>word<b>\t</b>stem</tt>, i.e: two tab seperated words
239 * @deprecated This prevents reuse of TokenStreams. If you wish to use a custom
240 * stem dictionary, create your own Analyzer with {@link StemmerOverrideFilter}
243 public void setStemDictionary(File stemdictFile) {
245 stemdict = WordlistLoader.getStemDict(IOUtils.getDecodingReader(stemdictFile,
246 IOUtils.CHARSET_UTF_8), new CharArrayMap<String>(matchVersion, 16, false));
247 setPreviousTokenStream(null); // force a new stemmer to be created
248 } catch (IOException e) {
249 // TODO: throw IOException
250 throw new RuntimeException(e);
256 * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
257 * text in the provided {@link Reader}.
259 * @return A {@link TokenStream} built from a {@link StandardTokenizer}
260 * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
261 * {@link StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion set is provided,
262 * {@link StemmerOverrideFilter}, and {@link SnowballFilter}
265 protected TokenStreamComponents createComponents(String fieldName,
267 if (matchVersion.onOrAfter(Version.LUCENE_31)) {
268 final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
269 TokenStream result = new StandardFilter(matchVersion, source);
270 result = new LowerCaseFilter(matchVersion, result);
271 result = new StopFilter(matchVersion, result, stoptable);
272 if (!excltable.isEmpty())
273 result = new KeywordMarkerFilter(result, excltable);
274 if (!stemdict.isEmpty())
275 result = new StemmerOverrideFilter(matchVersion, result, stemdict);
276 result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
277 return new TokenStreamComponents(source, result);
279 final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
280 TokenStream result = new StandardFilter(matchVersion, source);
281 result = new StopFilter(matchVersion, result, stoptable);
282 if (!excltable.isEmpty())
283 result = new KeywordMarkerFilter(result, excltable);
284 result = new DutchStemFilter(result, stemdict);
285 return new TokenStreamComponents(source, result);