lucene-java-3.4.0/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2Filter.java

   1 package org.apache.lucene.analysis.icu;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21
  22 import org.apache.lucene.analysis.TokenFilter;
  23 import org.apache.lucene.analysis.TokenStream;
  24 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  25
  26 import com.ibm.icu.text.Normalizer;
  27 import com.ibm.icu.text.Normalizer2;
  28
  29 /**
  30  * Normalize token text with ICU's {@link com.ibm.icu.text.Normalizer2}
  31  * <p>
  32  * With this filter, you can normalize text in the following ways:
  33  * <ul>
  34  *  <li> NFKC Normalization, Case Folding, and removing Ignorables (the default)
  35  *  <li> Using a standard Normalization mode (NFC, NFD, NFKC, NFKD)
  36  *  <li> Based on rules from a custom normalization mapping.
  37  * </ul>
  38  * <p>
  39  * If you use the defaults, this filter is a simple way to standardize Unicode text
  40  * in a language-independent way for search:
  41  * <ul>
  42  *  <li> The case folding that it does can be seen as a replacement for
  43  *  LowerCaseFilter: For example, it handles cases such as the Greek sigma, so that
  44  * "Μάϊος" and "ΜΆΪΟΣ" will match correctly.
   45  *  <li> The normalization standardizes different forms of the same
  46  *  character in Unicode. For example, CJK full-width numbers will be standardized
  47  *  to their ASCII forms.
  48  *  <li> Ignorables such as Zero-Width Joiner and Variation Selectors are removed.
  49  *  These are typically modifier characters that affect display.
  50  * </ul>
  51  *
  52  * @see com.ibm.icu.text.Normalizer2
  53  * @see com.ibm.icu.text.FilteredNormalizer2
  54  */
  55 public class ICUNormalizer2Filter extends TokenFilter {
  56   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  57   private final Normalizer2 normalizer;
  58   private final StringBuilder buffer = new StringBuilder();
  59
  60   /**
  61    * Create a new Normalizer2Filter that combines NFKC normalization, Case
  62    * Folding, and removes Default Ignorables (NFKC_Casefold)
  63    */
  64   public ICUNormalizer2Filter(TokenStream input) {
  65     this(input, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
  66   }
  67
  68   /**
  69    * Create a new Normalizer2Filter with the specified Normalizer2
  70    * @param input stream
  71    * @param normalizer normalizer to use
  72    */
  73   public ICUNormalizer2Filter(TokenStream input, Normalizer2 normalizer) {
  74     super(input);
  75     this.normalizer = normalizer;
  76   }
  77
  78   @Override
  79   public final boolean incrementToken() throws IOException {
  80     if (input.incrementToken()) {
  81       if (normalizer.quickCheck(termAtt) != Normalizer.YES) {
  82         buffer.setLength(0);
  83         normalizer.normalize(termAtt, buffer);
  84         termAtt.setEmpty().append(buffer);
  85       }
  86       return true;
  87     } else {
  88       return false;
  89     }
  90   }
  91 }