X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java?ds=sidebyside diff --git a/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java new file mode 100644 index 0000000..895fbb3 --- /dev/null +++ b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java @@ -0,0 +1,129 @@ +package org.apache.lucene.analysis.snowball; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.en.EnglishPossessiveFilter; +import org.apache.lucene.analysis.standard.*; +import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; +import org.apache.lucene.util.Version; + +import java.io.IOException; +import java.io.Reader; +import java.util.Set; + +/** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link + * LowerCaseFilter}, {@link StopFilter} and {@link SnowballFilter}. + * + * Available stemmers are listed in org.tartarus.snowball.ext. The name of a + * stemmer is the part of the class name before "Stemmer", e.g., the stemmer in + * {@link org.tartarus.snowball.ext.EnglishStemmer} is named "English". + * + *

<p><b>NOTE</b>: This class uses the same {@link Version}
 * dependent settings as {@link StandardAnalyzer}, with the following addition:
 * <ul>
 *   <li>As of 3.1, uses {@link TurkishLowerCaseFilter} for the Turkish language.</li>
 * </ul>

+ * @deprecated Use the language-specific analyzer in contrib/analyzers instead. + * This analyzer will be removed in Lucene 5.0 + */ +@Deprecated +public final class SnowballAnalyzer extends Analyzer { + private String name; + private Set stopSet; + private final Version matchVersion; + + /** Builds the named analyzer with no stop words. */ + public SnowballAnalyzer(Version matchVersion, String name) { + this.name = name; + this.matchVersion = matchVersion; + } + + /** + * Builds the named analyzer with the given stop words. + * @deprecated Use {@link #SnowballAnalyzer(Version, String, Set)} instead. + */ + @Deprecated + public SnowballAnalyzer(Version matchVersion, String name, String[] stopWords) { + this(matchVersion, name); + stopSet = StopFilter.makeStopSet(matchVersion, stopWords); + } + + /** Builds the named analyzer with the given stop words. */ + public SnowballAnalyzer(Version matchVersion, String name, Set stopWords) { + this(matchVersion, name); + stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, + stopWords)); + } + + /** Constructs a {@link StandardTokenizer} filtered by a {@link + StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter}, + and a {@link SnowballFilter} */ + @Override + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new StandardTokenizer(matchVersion, reader); + result = new StandardFilter(matchVersion, result); + // remove the possessive 's for english stemmers + if (matchVersion.onOrAfter(Version.LUCENE_31) && + (name.equals("English") || name.equals("Porter") || name.equals("Lovins"))) + result = new EnglishPossessiveFilter(result); + // Use a special lowercase filter for turkish, the stemmer expects it. 
+ if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish")) + result = new TurkishLowerCaseFilter(result); + else + result = new LowerCaseFilter(matchVersion, result); + if (stopSet != null) + result = new StopFilter(matchVersion, + result, stopSet); + result = new SnowballFilter(result, name); + return result; + } + + private class SavedStreams { + Tokenizer source; + TokenStream result; + } + + /** Returns a (possibly reused) {@link StandardTokenizer} filtered by a + * {@link StandardFilter}, a {@link LowerCaseFilter}, + * a {@link StopFilter}, and a {@link SnowballFilter} */ + @Override + public TokenStream reusableTokenStream(String fieldName, Reader reader) + throws IOException { + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + streams.source = new StandardTokenizer(matchVersion, reader); + streams.result = new StandardFilter(matchVersion, streams.source); + // Use a special lowercase filter for turkish, the stemmer expects it. + if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish")) + streams.result = new TurkishLowerCaseFilter(streams.result); + else + streams.result = new LowerCaseFilter(matchVersion, streams.result); + if (stopSet != null) + streams.result = new StopFilter(matchVersion, + streams.result, stopSet); + streams.result = new SnowballFilter(streams.result, name); + setPreviousTokenStream(streams); + } else { + streams.source.reset(reader); + } + return streams.result; + } +}