--- /dev/null
+package org.apache.lucene.analysis.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+import com.ibm.icu.text.Normalizer;
+import com.ibm.icu.text.Normalizer2;
+
+/**
+ * Normalize token text with ICU's {@link com.ibm.icu.text.Normalizer2}
+ * <p>
+ * With this filter, you can normalize text in the following ways:
+ * <ul>
+ * <li> NFKC Normalization, Case Folding, and removing Ignorables (the default)
+ * <li> Using a standard Normalization mode (NFC, NFD, NFKC, NFKD)
+ * <li> Based on rules from a custom normalization mapping.
+ * </ul>
+ * <p>
+ * If you use the defaults, this filter is a simple way to standardize Unicode text
+ * in a language-independent way for search:
+ * <ul>
+ * <li> The case folding that it does can be seen as a replacement for
+ * LowerCaseFilter: For example, it handles cases such as the Greek sigma, so that
+ * "Μάϊος" and "ΜΆΪΟΣ" will match correctly.
+ * <li> The normalization will standardizes different forms of the same
+ * character in Unicode. For example, CJK full-width numbers will be standardized
+ * to their ASCII forms.
+ * <li> Ignorables such as Zero-Width Joiner and Variation Selectors are removed.
+ * These are typically modifier characters that affect display.
+ * </ul>
+ *
+ * @see com.ibm.icu.text.Normalizer2
+ * @see com.ibm.icu.text.FilteredNormalizer2
+ */
+public class ICUNormalizer2Filter extends TokenFilter {
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final Normalizer2 normalizer;
+ private final StringBuilder buffer = new StringBuilder();
+
+ /**
+ * Create a new Normalizer2Filter that combines NFKC normalization, Case
+ * Folding, and removes Default Ignorables (NFKC_Casefold)
+ */
+ public ICUNormalizer2Filter(TokenStream input) {
+ this(input, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
+ }
+
+ /**
+ * Create a new Normalizer2Filter with the specified Normalizer2
+ * @param input stream
+ * @param normalizer normalizer to use
+ */
+ public ICUNormalizer2Filter(TokenStream input, Normalizer2 normalizer) {
+ super(input);
+ this.normalizer = normalizer;
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (normalizer.quickCheck(termAtt) != Normalizer.YES) {
+ buffer.setLength(0);
+ normalizer.normalize(termAtt, buffer);
+ termAtt.setEmpty().append(buffer);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+}