1 package org.apache.lucene.analysis.icu;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
22 import org.apache.lucene.analysis.TokenFilter;
23 import org.apache.lucene.analysis.TokenStream;
24 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
26 import com.ibm.icu.text.Replaceable;
27 import com.ibm.icu.text.Transliterator;
28 import com.ibm.icu.text.UTF16;
29 import com.ibm.icu.text.UnicodeSet;
32 * A {@link TokenFilter} that transforms text with ICU.
34 * ICU provides text-transformation functionality via its Transliteration API.
35 * Although script conversion is its most common use, a Transliterator can
36 * actually perform a more general class of tasks. In fact, Transliterator
37 * defines a very general API which specifies only that a segment of the input
38 * text is replaced by new text. The particulars of this conversion are
39 * determined entirely by subclasses of Transliterator.
42 * Some useful transformations for search are built-in:
44 * <li>Conversion from Traditional to Simplified Chinese characters
45 * <li>Conversion from Hiragana to Katakana
46 * <li>Conversion from Fullwidth to Halfwidth forms.
47 * <li>Script conversions, for example Serbian Cyrillic to Latin
51 * Example usage: <blockquote>stream = new ICUTransformFilter(stream,
52 * Transliterator.getInstance("Traditional-Simplified"));</blockquote>
54 * For more details, see the <a
55 * href="http://userguide.icu-project.org/transforms/general">ICU User Guide</a>.
58 public final class ICUTransformFilter extends TokenFilter {
59 // Transliterator applied to the term text of every token in incrementToken().
60 private final Transliterator transform;
62 // Reusable position object; its limit/context fields are reassigned for each token before transliterating.
63 private final Transliterator.Position position = new Transliterator.Position();
65 // Term attribute of this stream; its contents are rewritten in place with the transformed text.
66 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
68 // Adapter that exposes the term attribute through ICU's Replaceable interface, reused across tokens.
69 private final ReplaceableTermAttribute replaceableAttribute = new ReplaceableTermAttribute();
72 * Create a new ICUTransformFilter that transforms text on the given stream.
74 * @param input {@link TokenStream} to filter.
75 * @param transform Transliterator to transform the text.
// Builds the filter and, for rule-based transliterators that have no filter yet,
// installs the transliterator's own source set as its filter.
// NOTE: setFilter() below mutates the Transliterator instance supplied by the caller.
77 public ICUTransformFilter(TokenStream input, Transliterator transform) {
79 this.transform = transform;
82 * This is cheating, but speeds things up a lot.
83 * If we wanted to use pkg-private APIs we could probably do better.
// Only touch the transform when the caller has not configured a filter and it is a RuleBasedTransliterator.
85 if (transform.getFilter() == null && transform instanceof com.ibm.icu.text.RuleBasedTransliterator) {
86 final UnicodeSet sourceSet = transform.getSourceSet();
// A null or empty source set is never installed as the filter.
// NOTE(review): presumably because an empty filter would match no characters at all; confirm against the ICU docs.
87 if (sourceSet != null && !sourceSet.isEmpty())
88 transform.setFilter(sourceSet);
// Advances the wrapped stream and, when a token is available, transliterates its term text in place.
93 public boolean incrementToken() throws IOException {
95 * Wrap around replaceable. clear the positions, and transliterate.
97 if (input.incrementToken()) {
// Bind the Replaceable adapter to the current term so that edits are written to the term attribute.
98 replaceableAttribute.setText(termAtt);
100 final int length = termAtt.length();
// Reset the shared position object so the limit and the context window end at the current term length.
102 position.limit = length;
103 position.contextStart = 0;
104 position.contextLimit = length;
// NOTE(review): the trailing 'false' is presumably ICU's "incremental" flag (i.e. transform the whole range now); confirm.
106 transform.filteredTransliterate(replaceableAttribute, position, false);
114 * Wrap a {@link CharTermAttribute} with the Replaceable API.
116 final class ReplaceableTermAttribute implements Replaceable {
// Cached char array of the wrapped term attribute; set in setText() and reassigned when the attribute is resized.
117 private char buffer[];
// The term attribute being edited; setLength()/resizeBuffer() are called on it by the replace operations.
119 private CharTermAttribute token;
// Points this adapter at the given term attribute by caching its char array and current length.
121 void setText(final CharTermAttribute token) {
// Cached so that char32At()/getChars() can read the array directly instead of going through the attribute.
123 this.buffer = token.buffer();
124 this.length = token.length();
// Returns the value UTF16.charAt() yields for index pos of the cached buffer.
127 public int char32At(int pos) {
// The 0..length bounds restrict the lookup to the term's current length, not the whole array.
128 return UTF16.charAt(buffer, 0, length, pos);
131 public char charAt(int pos) {
// Copies the chars in [start, limit) and inserts them at index dest.
135 public void copy(int start, int limit, int dest) {
// Snapshot the source range into a temporary array first: the replace() call below may shift or reallocate the buffer.
136 char text[] = new char[limit - start];
137 getChars(start, limit, text, 0);
// Replacing the empty range [dest, dest) makes this a pure insertion of (limit - start) chars.
138 replace(dest, dest, text, 0, limit - start);
// Copies buffer[srcStart, srcLimit) into dst, starting at dst[dstStart].
141 public void getChars(int srcStart, int srcLimit, char[] dst, int dstStart) {
142 System.arraycopy(buffer, srcStart, dst, dstStart, srcLimit - srcStart);
145 public boolean hasMetaData() {
149 public int length() {
// Replaces the chars in [start, limit) with the contents of text and updates the term length.
153 public void replace(int start, int limit, String text) {
154 final int charsLen = text.length();
// Make room for the replacement first; shiftForReplace's int result is used as the new term length.
155 final int newLength = shiftForReplace(start, limit, charsLen);
156 // insert the replacement text
157 text.getChars(0, charsLen, buffer, start);
// Store the new length in the cached field and in the term attribute in one step.
158 token.setLength(length = newLength);
// Replaces the chars in [start, limit) with charsLen chars taken from text, beginning at text[charsStart],
// and updates the term length.
161 public void replace(int start, int limit, char[] text, int charsStart,
163 // shift text if necessary for the replacement
164 final int newLength = shiftForReplace(start, limit, charsLen);
165 // insert the replacement text
166 System.arraycopy(text, charsStart, buffer, start, charsLen);
// Store the new length in the cached field and in the term attribute in one step.
167 token.setLength(length = newLength);
170 /** shift text (if necessary) for a replacement operation */
171 private int shiftForReplace(int start, int limit, int charsLen) {
172 final int replacementLength = limit - start;
173 final int newLength = length - replacementLength + charsLen;
174 // resize if necessary
175 if (newLength > length)
176 buffer = token.resizeBuffer(newLength);
177 // if the substring being replaced is longer or shorter than the
178 // replacement, need to shift things around
179 if (replacementLength != charsLen && limit < length)
180 System.arraycopy(buffer, limit, buffer, start + charsLen, length - limit);