X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java diff --git a/lucene-java-3.5.0/lucene/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java b/lucene-java-3.5.0/lucene/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java new file mode 100644 index 0000000..a9bce86 --- /dev/null +++ b/lucene-java-3.5.0/lucene/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java @@ -0,0 +1,178 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.cn.smart; + +import java.io.IOException; +import java.io.Reader; +import java.util.Collections; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.PorterStemFilter; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WordlistLoader; +import org.apache.lucene.analysis.cn.smart.SentenceTokenizer; +import org.apache.lucene.analysis.cn.smart.WordTokenFilter; +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.Version; + +/** + *

+ * SmartChineseAnalyzer is an analyzer for Chinese or mixed Chinese-English text. + * The analyzer uses probabilistic knowledge to find the optimal word segmentation for Simplified Chinese text. + * The text is first broken into sentences, then each sentence is segmented into words. + *

+ *

+ * Segmentation is based upon the Hidden Markov Model. + * A large training corpus was used to calculate Chinese word frequency probability. + *

+ *

+ * This analyzer requires a dictionary to provide statistical data. + * SmartChineseAnalyzer has an included dictionary out-of-box. + *

+ *

+ * The included dictionary data is from ICTCLAS1.0. + * Thanks to ICTCLAS for their hard work, and for contributing the data under the Apache 2 License! + *

+ * @lucene.experimental + */ +public final class SmartChineseAnalyzer extends Analyzer { + + private final Set stopWords; + + private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt"; + + private static final String STOPWORD_FILE_COMMENT = "//"; + + /** + * Returns an unmodifiable instance of the default stop-words set. + * @return an unmodifiable instance of the default stop-words set. + */ + public static CharArraySet getDefaultStopSet(){ + return DefaultSetHolder.DEFAULT_STOP_SET; + } + + /** + * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class + * accesses the static final set the first time.; + */ + private static class DefaultSetHolder { + static final CharArraySet DEFAULT_STOP_SET; + + static { + try { + DEFAULT_STOP_SET = loadDefaultStopWordSet(); + } catch (IOException ex) { + // default set should always be present as it is part of the + // distribution (JAR) + throw new RuntimeException("Unable to load default stopword set"); + } + } + + static CharArraySet loadDefaultStopWordSet() throws IOException { + // make sure it is unmodifiable as we expose it in the outer class + return org.apache.lucene.analysis.CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(IOUtils + .getDecodingReader(SmartChineseAnalyzer.class, DEFAULT_STOPWORD_FILE, + IOUtils.CHARSET_UTF_8), STOPWORD_FILE_COMMENT, + Version.LUCENE_CURRENT)); + } + } + + private final Version matchVersion; + + /** + * Create a new SmartChineseAnalyzer, using the default stopword list. + */ + public SmartChineseAnalyzer(Version matchVersion) { + this(matchVersion, true); + } + + /** + *

+ * Create a new SmartChineseAnalyzer, optionally using the default stopword list. + *

+ *

+ * The included default stopword list is simply a list of punctuation. + * If you do not use this list, punctuation will not be removed from the text! + *

+ * + * @param useDefaultStopWords true to use the default stopword list. + */ + public SmartChineseAnalyzer(Version matchVersion, boolean useDefaultStopWords) { + stopWords = useDefaultStopWords ? DefaultSetHolder.DEFAULT_STOP_SET + : Collections.EMPTY_SET; + this.matchVersion = matchVersion; + } + + /** + *

+ * Create a new SmartChineseAnalyzer, using the provided {@link Set} of stopwords. + *

+ *

+ * Note: the set should include punctuation, unless you want to index punctuation! + *

+ * @param stopWords {@link Set} of stopwords to use. + */ + public SmartChineseAnalyzer(Version matchVersion, Set stopWords) { + this.stopWords = stopWords==null?Collections.EMPTY_SET:stopWords; + this.matchVersion = matchVersion; + } + + @Override + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new SentenceTokenizer(reader); + result = new WordTokenFilter(result); + // result = new LowerCaseFilter(result); + // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text. + // The porter stemming is too strict, this is not a bug, this is a feature:) + result = new PorterStemFilter(result); + if (!stopWords.isEmpty()) { + result = new StopFilter(matchVersion, result, stopWords, false); + } + return result; + } + + private static final class SavedStreams { + Tokenizer tokenStream; + TokenStream filteredTokenStream; + } + + @Override + public TokenStream reusableTokenStream(String fieldName, Reader reader) + throws IOException { + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + setPreviousTokenStream(streams); + streams.tokenStream = new SentenceTokenizer(reader); + streams.filteredTokenStream = new WordTokenFilter(streams.tokenStream); + streams.filteredTokenStream = new PorterStemFilter(streams.filteredTokenStream); + if (!stopWords.isEmpty()) { + streams.filteredTokenStream = new StopFilter(matchVersion, streams.filteredTokenStream, stopWords, false); + } + } else { + streams.tokenStream.reset(reader); + streams.filteredTokenStream.reset(); // reset WordTokenFilter's state + } + + return streams.filteredTokenStream; + } +}