lucene-java-3.5.0/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/ICUTransformFilter.java

   1 package org.apache.lucene.analysis.icu;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21
  22 import org.apache.lucene.analysis.TokenFilter;
  23 import org.apache.lucene.analysis.TokenStream;
  24 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  25
  26 import com.ibm.icu.text.Replaceable;
  27 import com.ibm.icu.text.Transliterator;
  28 import com.ibm.icu.text.UTF16;
  29 import com.ibm.icu.text.UnicodeSet;
  30
  31 /**
  32  * A {@link TokenFilter} that transforms text with ICU.
  33  * <p>
  34  * ICU provides text-transformation functionality via its Transliteration API.
  35  * Although script conversion is its most common use, a Transliterator can
  36  * actually perform a more general class of tasks. In fact, Transliterator
  37  * defines a very general API which specifies only that a segment of the input
  38  * text is replaced by new text. The particulars of this conversion are
  39  * determined entirely by subclasses of Transliterator.
  40  * </p>
  41  * <p>
  42  * Some useful transformations for search are built-in:
  43  * <ul>
  44  * <li>Conversion from Traditional to Simplified Chinese characters
  45  * <li>Conversion from Hiragana to Katakana
  46  * <li>Conversion from Fullwidth to Halfwidth forms.
  47  * <li>Script conversions, for example Serbian Cyrillic to Latin
  48  * </ul>
  49  * </p>
  50  * <p>
  51  * Example usage: <blockquote>stream = new ICUTransformFilter(stream,
  52  * Transliterator.getInstance("Traditional-Simplified"));</blockquote>
  53  * </p>
  54  * For more details, see the <a
  55  * href="http://userguide.icu-project.org/transforms/general">ICU User
  56  * Guide</a>.
  57  */
  58 public final class ICUTransformFilter extends TokenFilter {
  59   // Transliterator to transform the text
  60   private final Transliterator transform;
  61
  62   // Reusable position object
  63   private final Transliterator.Position position = new Transliterator.Position();
  64
  65   // term attribute, will be updated with transformed text.
  66   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  67
  68   // Wraps a termAttribute around the replaceable interface.
  69   private final ReplaceableTermAttribute replaceableAttribute = new ReplaceableTermAttribute();
  70
  71   /**
  72    * Create a new ICUTransformFilter that transforms text on the given stream.
  73    *
  74    * @param input {@link TokenStream} to filter.
  75    * @param transform Transliterator to transform the text.
  76    */
  77   public ICUTransformFilter(TokenStream input, Transliterator transform) {
  78     super(input);
  79     this.transform = transform;
  80
  81     /*
  82      * This is cheating, but speeds things up a lot.
  83      * If we wanted to use pkg-private APIs we could probably do better.
  84      */
  85     if (transform.getFilter() == null && transform instanceof com.ibm.icu.text.RuleBasedTransliterator) {
  86       final UnicodeSet sourceSet = transform.getSourceSet();
  87       if (sourceSet != null && !sourceSet.isEmpty())
  88         transform.setFilter(sourceSet);
  89     }
  90   }
  91
  92   @Override
  93   public boolean incrementToken() throws IOException {
  94     /*
  95      * Wrap around replaceable. clear the positions, and transliterate.
  96      */
  97     if (input.incrementToken()) {
  98       replaceableAttribute.setText(termAtt);
  99
 100       final int length = termAtt.length();
 101       position.start = 0;
 102       position.limit = length;
 103       position.contextStart = 0;
 104       position.contextLimit = length;
 105
 106       transform.filteredTransliterate(replaceableAttribute, position, false);
 107       return true;
 108     } else {
 109       return false;
 110     }
 111   }
 112
 113   /**
 114    * Wrap a {@link CharTermAttribute} with the Replaceable API.
 115    */
 116   final class ReplaceableTermAttribute implements Replaceable {
 117     private char buffer[];
 118     private int length;
 119     private CharTermAttribute token;
 120
 121     void setText(final CharTermAttribute token) {
 122       this.token = token;
 123       this.buffer = token.buffer();
 124       this.length = token.length();
 125     }
 126
 127     public int char32At(int pos) {
 128       return UTF16.charAt(buffer, 0, length, pos);
 129     }
 130
 131     public char charAt(int pos) {
 132       return buffer[pos];
 133     }
 134
 135     public void copy(int start, int limit, int dest) {
 136       char text[] = new char[limit - start];
 137       getChars(start, limit, text, 0);
 138       replace(dest, dest, text, 0, limit - start);
 139     }
 140
 141     public void getChars(int srcStart, int srcLimit, char[] dst, int dstStart) {
 142       System.arraycopy(buffer, srcStart, dst, dstStart, srcLimit - srcStart);
 143     }
 144
 145     public boolean hasMetaData() {
 146       return false;
 147     }
 148
 149     public int length() {
 150       return length;
 151     }
 152
 153     public void replace(int start, int limit, String text) {
 154       final int charsLen = text.length();
 155       final int newLength = shiftForReplace(start, limit, charsLen);
 156       // insert the replacement text
 157       text.getChars(0, charsLen, buffer, start);
 158       token.setLength(length = newLength);
 159     }
 160
 161     public void replace(int start, int limit, char[] text, int charsStart,
 162         int charsLen) {
 163       // shift text if necessary for the replacement
 164       final int newLength = shiftForReplace(start, limit, charsLen);
 165       // insert the replacement text
 166       System.arraycopy(text, charsStart, buffer, start, charsLen);
 167       token.setLength(length = newLength);
 168     }
 169
 170     /** shift text (if necessary) for a replacement operation */
 171     private int shiftForReplace(int start, int limit, int charsLen) {
 172       final int replacementLength = limit - start;
 173       final int newLength = length - replacementLength + charsLen;
 174       // resize if necessary
 175       if (newLength > length)
 176         buffer = token.resizeBuffer(newLength);
 177       // if the substring being replaced is longer or shorter than the
 178       // replacement, need to shift things around
 179       if (replacementLength != charsLen && limit < length)
 180         System.arraycopy(buffer, limit, buffer, start + charsLen, length - limit);
 181       return newLength;
 182     }
 183   }
 184 }