1 package org.apache.lucene.analysis.cz;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
/**
 * {@link Analyzer} for Czech language.
 * <p>
 * Supports an external list of stopwords (words that will not be indexed at
 * all). A default set of stopwords is used unless an alternative list is
 * specified.
 * </p>
 *
 * <a name="version"></a>
 * <p>
 * You must specify the required {@link Version} compatibility when creating
 * CzechAnalyzer:
 * <ul>
 * <li>As of 3.1, words are stemmed with {@link CzechStemFilter}
 * <li>As of 2.9, StopFilter preserves position increments
 * <li>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
 * <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
 * </ul>
 */
57 public final class CzechAnalyzer extends ReusableAnalyzerBase {
60 * List of typical stopwords.
61 * @deprecated use {@link #getDefaultStopSet()} instead
64 public final static String[] CZECH_STOP_WORDS = {
65 "a","s","k","o","i","u","v","z","dnes","cz","t\u00edmto","bude\u0161","budem",
66 "byli","jse\u0161","m\u016fj","sv\u00fdm","ta","tomto","tohle","tuto","tyto",
67 "jej","zda","pro\u010d","m\u00e1te","tato","kam","tohoto","kdo","kte\u0159\u00ed",
68 "mi","n\u00e1m","tom","tomuto","m\u00edt","nic","proto","kterou","byla",
69 "toho","proto\u017ee","asi","ho","na\u0161i","napi\u0161te","re","co\u017e","t\u00edm",
70 "tak\u017ee","sv\u00fdch","jej\u00ed","sv\u00fdmi","jste","aj","tu","tedy","teto",
71 "bylo","kde","ke","prav\u00e9","ji","nad","nejsou","\u010di","pod","t\u00e9ma",
72 "mezi","p\u0159es","ty","pak","v\u00e1m","ani","kdy\u017e","v\u0161ak","neg","jsem",
73 "tento","\u010dl\u00e1nku","\u010dl\u00e1nky","aby","jsme","p\u0159ed","pta","jejich",
74 "byl","je\u0161t\u011b","a\u017e","bez","tak\u00e9","pouze","prvn\u00ed","va\u0161e","kter\u00e1",
75 "n\u00e1s","nov\u00fd","tipy","pokud","m\u016f\u017ee","strana","jeho","sv\u00e9","jin\u00e9",
76 "zpr\u00e1vy","nov\u00e9","nen\u00ed","v\u00e1s","jen","podle","zde","u\u017e","b\u00fdt","v\u00edce",
77 "bude","ji\u017e","ne\u017e","kter\u00fd","by","kter\u00e9","co","nebo","ten","tak",
78 "m\u00e1","p\u0159i","od","po","jsou","jak","dal\u0161\u00ed","ale","si","se","ve",
79 "to","jako","za","zp\u011bt","ze","do","pro","je","na","atd","atp",
80 "jakmile","p\u0159i\u010dem\u017e","j\u00e1","on","ona","ono","oni","ony","my","vy",
81 "j\u00ed","ji","m\u011b","mne","jemu","tomu","t\u011bm","t\u011bmu","n\u011bmu","n\u011bmu\u017e",
82 "jeho\u017e","j\u00ed\u017e","jeliko\u017e","je\u017e","jako\u017e","na\u010de\u017e",
85 /** File containing default Czech stopwords. */
86 public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
89 * Returns a set of default Czech-stopwords
91 * @return a set of default Czech-stopwords
93 public static final Set<?> getDefaultStopSet(){
94 return DefaultSetHolder.DEFAULT_SET;
97 private static class DefaultSetHolder {
98 private static final Set<?> DEFAULT_SET;
102 DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
103 Version.LUCENE_CURRENT, WordlistLoader.getWordSet(CzechAnalyzer.class,
104 DEFAULT_STOPWORD_FILE, "#"), false));
105 } catch (IOException ex) {
106 // default set should always be present as it is part of the
107 // distribution (JAR)
108 throw new RuntimeException("Unable to load default stopword set");
115 * Contains the stopwords used with the {@link StopFilter}.
117 // TODO once loadStopWords is gone those member should be removed too in favor of StopwordAnalyzerBase
118 private Set<?> stoptable;
119 private final Version matchVersion;
120 private final Set<?> stemExclusionTable;
/**
 * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
 *
 * @param matchVersion Lucene version to match. See
 *          <a href="#version">above</a>
 */
public CzechAnalyzer(Version matchVersion) {
  this(matchVersion, DefaultSetHolder.DEFAULT_SET);
}
/**
 * Builds an analyzer with the given stop words and an empty stem exclusion
 * set ({@link CharArraySet#EMPTY_SET}).
 *
 * @param matchVersion Lucene version to match. See
 *          <a href="#version">above</a>
 * @param stopwords a stopword set
 */
public CzechAnalyzer(Version matchVersion, Set<?> stopwords) {
  this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
/**
 * Builds an analyzer with the given stop words and a set of words to be
 * excluded from the {@link CzechStemFilter}.
 * <p>
 * Both sets are copied with {@link CharArraySet#copy} and wrapped as
 * unmodifiable before being stored.
 *
 * @param matchVersion Lucene version to match. See
 *          <a href="#version">above</a>
 * @param stopwords a stopword set
 * @param stemExclusionTable a stemming exclusion set
 */
public CzechAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionTable) {
  this.matchVersion = matchVersion;
  this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
  this.stemExclusionTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
}
/**
 * Builds an analyzer with the given stop words. The words are turned into a
 * set with {@link StopFilter#makeStopSet(Version, String...)}.
 *
 * @param matchVersion Lucene version to match. See
 *          <a href="#version">above</a>
 * @param stopwords the stopwords
 * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
 */
@Deprecated
public CzechAnalyzer(Version matchVersion, String... stopwords) {
  this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
}
/**
 * Builds an analyzer with the given stop words.
 *
 * @param matchVersion Lucene version to match. See
 *          <a href="#version">above</a>
 * @param stopwords a stopword set
 * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
 */
@Deprecated
public CzechAnalyzer(Version matchVersion, HashSet<?> stopwords) {
  // The cast selects the (Version, Set) overload instead of recursing into this one.
  this(matchVersion, (Set<?>) stopwords);
}
/**
 * Builds an analyzer with the stop words read from the given file via
 * {@link WordlistLoader#getWordSet(File)}.
 *
 * @param matchVersion Lucene version to match. See
 *          <a href="#version">above</a>
 * @param stopwords a file containing stopwords
 * @throws IOException if the stopword file cannot be read
 * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
 */
@Deprecated
public CzechAnalyzer(Version matchVersion, File stopwords) throws IOException {
  this(matchVersion, (Set<?>) WordlistLoader.getWordSet(stopwords));
}
199 * Loads stopwords hash from resource stream (file, database...).
200 * @param wordfile File containing the wordlist
201 * @param encoding Encoding used (win-1250, iso-8859-2, ...), null for default system encoding
202 * @deprecated use {@link WordlistLoader#getWordSet(Reader, String) }
203 * and {@link #CzechAnalyzer(Version, Set)} instead
205 // TODO extend StopwordAnalyzerBase once this method is gone!
207 public void loadStopWords( InputStream wordfile, String encoding ) {
208 setPreviousTokenStream(null); // force a new stopfilter to be created
209 if ( wordfile == null ) {
210 stoptable = Collections.emptySet();
214 // clear any previous table (if present)
215 stoptable = Collections.emptySet();
217 InputStreamReader isr;
218 if (encoding == null)
219 isr = new InputStreamReader(wordfile);
221 isr = new InputStreamReader(wordfile, encoding);
223 stoptable = WordlistLoader.getWordSet(isr);
224 } catch ( IOException e ) {
225 // clear any previous table (if present)
226 // TODO: throw IOException
227 stoptable = Collections.emptySet();
233 * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
234 * used to tokenize all the text in the provided {@link Reader}.
236 * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
237 * built from a {@link StandardTokenizer} filtered with
238 * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
239 * , and {@link CzechStemFilter} (only if version is >= LUCENE_31). If
240 * a version is >= LUCENE_31 and a stem exclusion set is provided via
241 * {@link #CzechAnalyzer(Version, Set, Set)} a
242 * {@link KeywordMarkerFilter} is added before
243 * {@link CzechStemFilter}.
246 protected TokenStreamComponents createComponents(String fieldName,
248 final Tokenizer source = new StandardTokenizer(matchVersion, reader);
249 TokenStream result = new StandardFilter(matchVersion, source);
250 result = new LowerCaseFilter(matchVersion, result);
251 result = new StopFilter( matchVersion, result, stoptable);
252 if (matchVersion.onOrAfter(Version.LUCENE_31)) {
253 if(!this.stemExclusionTable.isEmpty())
254 result = new KeywordMarkerFilter(result, stemExclusionTable);
255 result = new CzechStemFilter(result);
257 return new TokenStreamComponents(source, result);