--- /dev/null
+package org.apache.lucene.analysis.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+import com.ibm.icu.text.Replaceable;
+import com.ibm.icu.text.Transliterator;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * A {@link TokenFilter} that transforms text with ICU.
+ * <p>
+ * ICU provides text-transformation functionality via its Transliteration API.
+ * Although script conversion is its most common use, a Transliterator can
+ * actually perform a more general class of tasks. In fact, Transliterator
+ * defines a very general API which specifies only that a segment of the input
+ * text is replaced by new text. The particulars of this conversion are
+ * determined entirely by subclasses of Transliterator.
+ * </p>
+ * <p>
+ * Some useful transformations for search are built-in:
+ * </p>
+ * <ul>
+ * <li>Conversion from Traditional to Simplified Chinese characters</li>
+ * <li>Conversion from Hiragana to Katakana</li>
+ * <li>Conversion from Fullwidth to Halfwidth forms</li>
+ * <li>Script conversions, for example Serbian Cyrillic to Latin</li>
+ * </ul>
+ * <p>
+ * Example usage:
+ * </p>
+ * <blockquote>stream = new ICUTransformFilter(stream,
+ * Transliterator.getInstance("Traditional-Simplified"));</blockquote>
+ * For more details, see the <a
+ * href="http://userguide.icu-project.org/transforms/general">ICU User
+ * Guide</a>.
+ */
+public final class ICUTransformFilter extends TokenFilter {
+  /** Transliterator applied to every term; the constructor may install a filter on it. */
+  private final Transliterator transform;
+
+  /** Reusable position object, reset to span the whole term before each transliteration. */
+  private final Transliterator.Position position = new Transliterator.Position();
+
+  /** Term attribute whose text is rewritten in place with the transformed text. */
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  /** Adapter that exposes the term attribute through ICU's {@link Replaceable} interface. */
+  private final ReplaceableTermAttribute replaceableAttribute = new ReplaceableTermAttribute();
+
+  /**
+   * Create a new ICUTransformFilter that transforms text on the given stream.
+   * <p>
+   * Note that this constructor may modify the supplied transliterator: when it
+   * has no filter and is a rule-based transliterator with a non-empty source
+   * set, that source set is installed as its filter.
+   * </p>
+   *
+   * @param input {@link TokenStream} to filter.
+   * @param transform Transliterator to transform the text.
+   */
+  public ICUTransformFilter(TokenStream input, Transliterator transform) {
+    super(input);
+    this.transform = transform;
+
+    /*
+     * This is cheating, but speeds things up a lot.
+     * If we wanted to use pkg-private APIs we could probably do better.
+     */
+    if (transform.getFilter() == null
+        && transform instanceof com.ibm.icu.text.RuleBasedTransliterator) {
+      final UnicodeSet sourceSet = transform.getSourceSet();
+      if (sourceSet != null && !sourceSet.isEmpty()) {
+        transform.setFilter(sourceSet);
+      }
+    }
+  }
+
+  /**
+   * Advances the wrapped stream and, if a token is available, transliterates
+   * its term text in place.
+   *
+   * @return {@code true} if a token was produced, {@code false} once the
+   *         wrapped stream is exhausted.
+   * @throws IOException if the wrapped stream fails to advance.
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+
+    // Point the Replaceable adapter at the current term.
+    replaceableAttribute.setText(termAtt);
+
+    // Reset the reusable position so both the transliteration range and its
+    // context cover the entire term.
+    final int length = termAtt.length();
+    position.start = 0;
+    position.limit = length;
+    position.contextStart = 0;
+    position.contextLimit = length;
+
+    transform.filteredTransliterate(replaceableAttribute, position, false);
+    return true;
+  }
+
+  /**
+   * Wrap a {@link CharTermAttribute} with the Replaceable API.
+   * <p>
+   * Declared {@code static} because it never touches the enclosing filter;
+   * all state comes from the attribute passed to {@link #setText}.
+   * </p>
+   */
+  static final class ReplaceableTermAttribute implements Replaceable {
+    // Cached view of the attribute's backing array; refreshed whenever the
+    // attribute's buffer is resized.
+    private char[] buffer;
+    // Number of valid chars in buffer, kept in sync with the attribute.
+    private int length;
+    // The attribute currently being wrapped.
+    private CharTermAttribute token;
+
+    /**
+     * Points this adapter at the given attribute, caching its buffer and
+     * length.
+     *
+     * @param token attribute whose text subsequent calls read and modify.
+     */
+    void setText(final CharTermAttribute token) {
+      this.token = token;
+      this.buffer = token.buffer();
+      this.length = token.length();
+    }
+
+    /** Returns the code point at {@code pos}, decoded from the valid part of the buffer. */
+    public int char32At(int pos) {
+      return UTF16.charAt(buffer, 0, length, pos);
+    }
+
+    /** Returns the UTF-16 code unit stored at {@code pos}. */
+    public char charAt(int pos) {
+      return buffer[pos];
+    }
+
+    /**
+     * Inserts a copy of the chars in {@code [start, limit)} at {@code dest}.
+     * The copy is taken into a temporary array first, then inserted with an
+     * empty replacement range so no existing text is removed.
+     */
+    public void copy(int start, int limit, int dest) {
+      final int count = limit - start;
+      final char[] text = new char[count];
+      getChars(start, limit, text, 0);
+      replace(dest, dest, text, 0, count);
+    }
+
+    /** Copies the chars in {@code [srcStart, srcLimit)} into {@code dst} at {@code dstStart}. */
+    public void getChars(int srcStart, int srcLimit, char[] dst, int dstStart) {
+      System.arraycopy(buffer, srcStart, dst, dstStart, srcLimit - srcStart);
+    }
+
+    /** Always {@code false}: this adapter carries no metadata. */
+    public boolean hasMetaData() {
+      return false;
+    }
+
+    /** Returns the current number of valid chars. */
+    public int length() {
+      return length;
+    }
+
+    /**
+     * Replaces the chars in {@code [start, limit)} with {@code text} and
+     * updates the attribute's length.
+     */
+    public void replace(int start, int limit, String text) {
+      final int charsLen = text.length();
+      // make room for (or close the gap left by) the replacement
+      final int newLength = shiftForReplace(start, limit, charsLen);
+      // insert the replacement text
+      text.getChars(0, charsLen, buffer, start);
+      length = newLength;
+      token.setLength(newLength);
+    }
+
+    /**
+     * Replaces the chars in {@code [start, limit)} with {@code charsLen}
+     * chars of {@code text} starting at {@code charsStart}, and updates the
+     * attribute's length.
+     */
+    public void replace(int start, int limit, char[] text, int charsStart,
+        int charsLen) {
+      // shift text if necessary for the replacement
+      final int newLength = shiftForReplace(start, limit, charsLen);
+      // insert the replacement text
+      System.arraycopy(text, charsStart, buffer, start, charsLen);
+      length = newLength;
+      token.setLength(newLength);
+    }
+
+    /**
+     * Shift text (if necessary) for a replacement operation.
+     * <p>
+     * Resizes the attribute's buffer when the result is longer than the
+     * current text, then moves the tail that follows {@code limit} so it
+     * starts right after the replacement. Does not update {@code length};
+     * callers do that once the replacement text has been written.
+     * </p>
+     *
+     * @return the length of the text after the replacement.
+     */
+    private int shiftForReplace(int start, int limit, int charsLen) {
+      final int replacementLength = limit - start;
+      final int newLength = length - replacementLength + charsLen;
+      // resize if necessary; keep the cached array in sync with the attribute
+      if (newLength > length) {
+        buffer = token.resizeBuffer(newLength);
+      }
+      // if the substring being replaced is longer or shorter than the
+      // replacement, need to shift things around
+      if (replacementLength != charsLen && limit < length) {
+        System.arraycopy(buffer, limit, buffer, start + charsLen, length - limit);
+      }
+      return newLength;
+    }
+  }
+}