--- /dev/null
+package org.apache.lucene.analysis.cn;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
+
+/**
+ * A {@link TokenFilter} with a stop word table.
+ * <ul>
+ * <li>Tokens contained in the stop word table are removed.
+ * <li>Empty tokens are removed.
+ * <li>Tokens whose first character is not a lowercase, uppercase or "other"
+ *     letter (for example numeric tokens) are removed.
+ * <li>English tokens must be longer than 1 character.
+ * <li>One Chinese character as one Chinese word.
+ * </ul>
+ * TODO:
+ * <ol>
+ * <li>Add Chinese stop words, such as \ue400
+ * <li>Dictionary based Chinese word extraction
+ * <li>Intelligent Chinese word extraction
+ * </ol>
+ *
+ * @version 1.0
+ * @deprecated Use {@link StopFilter} instead, which has the same functionality.
+ * This filter will be removed in Lucene 5.0
+ */
+@Deprecated
+public final class ChineseFilter extends TokenFilter {
+
+  /** Default stop words. Only English now, Chinese to be added later. */
+  public static final String[] STOP_WORDS = {
+    "and", "are", "as", "at", "be", "but", "by",
+    "for", "if", "in", "into", "is", "it",
+    "no", "not", "of", "on", "or", "such",
+    "that", "the", "their", "then", "there", "these",
+    "they", "this", "to", "was", "will", "with"
+  };
+
+  /** Stop word lookup table, built from {@link #STOP_WORDS} with ignoreCase = false. */
+  private final CharArraySet stopTable;
+
+  /** Term text of the token currently exposed by this stream. */
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  /**
+   * Creates a filter over the given stream using {@link #STOP_WORDS} as the
+   * stop word table.
+   *
+   * @param in the token stream to filter
+   */
+  public ChineseFilter(TokenStream in) {
+    super(in);
+
+    stopTable = new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS), false);
+  }
+
+  /**
+   * Advances the wrapped stream to the next token that passes the filter.
+   *
+   * @return {@code true} if an accepted token is available, {@code false} once
+   *         the wrapped stream is exhausted
+   * @throws IOException if the wrapped stream fails while advancing
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    while (input.incrementToken()) {
+      final char[] text = termAtt.buffer();
+      final int termLength = termAtt.length();
+
+      // Only the first termLength chars of the buffer belong to this token.
+      // For an empty term text[0] would be a leftover from an earlier token,
+      // so skip it before the first character is inspected.
+      if (termLength == 0 || stopTable.contains(text, 0, termLength)) {
+        continue;
+      }
+
+      // why not key off token type here assuming ChineseTokenizer comes first?
+      switch (Character.getType(text[0])) {
+        case Character.LOWERCASE_LETTER:
+        case Character.UPPERCASE_LETTER:
+          // English word/token should be longer than 1 character.
+          if (termLength > 1) {
+            return true;
+          }
+          break;
+        case Character.OTHER_LETTER:
+          // One Chinese character as one Chinese word.
+          // Chinese word extraction to be added later here.
+          return true;
+        default:
+          // Any other leading character category: drop the token.
+          break;
+      }
+    }
+    return false;
+  }
+
+}
\ No newline at end of file