lucene-java-3.4.0/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java

   1 package org.apache.lucene.analysis.icu;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.Reader;
  22 import java.io.StringReader;
  23
  24 import org.apache.lucene.analysis.Analyzer;
  25 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
  26 import org.apache.lucene.analysis.Tokenizer;
  27 import org.apache.lucene.analysis.KeywordTokenizer;
  28 import org.apache.lucene.analysis.WhitespaceTokenizer;
  29 import org.apache.lucene.analysis.ReusableAnalyzerBase;
  30 import org.apache.lucene.analysis.TokenStream;
  31
  32 import com.ibm.icu.text.Transliterator;
  33 import com.ibm.icu.text.UnicodeSet;
  34
  35
  36 /**
  37  * Test the ICUTransformFilter with some basic examples.
  38  */
  39 public class TestICUTransformFilter extends BaseTokenStreamTestCase {
  40
  41   public void testBasicFunctionality() throws Exception {
  42     checkToken(Transliterator.getInstance("Traditional-Simplified"),
  43         "簡化字", "简化字");
  44     checkToken(Transliterator.getInstance("Katakana-Hiragana"),
  45         "ヒラガナ", "ひらがな");
  46     checkToken(Transliterator.getInstance("Fullwidth-Halfwidth"),
  47         "アルアノリウ", "ｱﾙｱﾉﾘｳ");
  48     checkToken(Transliterator.getInstance("Any-Latin"),
  49         "Αλφαβητικός Κατάλογος", "Alphabētikós Katálogos");
  50     checkToken(Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove"),
  51         "Alphabētikós Katálogos", "Alphabetikos Katalogos");
  52     checkToken(Transliterator.getInstance("Han-Latin"),
  53         "中国", "zhōng guó");
  54   }
  55
  56   public void testCustomFunctionality() throws Exception {
  57     String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
  58     checkToken(Transliterator.createFromRules("test", rules, Transliterator.FORWARD), "abacadaba", "bcbcbdbcb");
  59   }
  60
  61   public void testCustomFunctionality2() throws Exception {
  62     String rules = "c { a > b; a > d;"; // convert a's to b's and b's to c's
  63     checkToken(Transliterator.createFromRules("test", rules, Transliterator.FORWARD), "caa", "cbd");
  64   }
  65
  66   public void testOptimizer() throws Exception {
  67     String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
  68     Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
  69     assertTrue(custom.getFilter() == null);
  70     new ICUTransformFilter(new KeywordTokenizer(new StringReader("")), custom);
  71     assertTrue(custom.getFilter().equals(new UnicodeSet("[ab]")));
  72   }
  73
  74   public void testOptimizer2() throws Exception {
  75     checkToken(Transliterator.getInstance("Traditional-Simplified; CaseFold"),
  76         "ABCDE", "abcde");
  77   }
  78
  79   public void testOptimizerSurrogate() throws Exception {
  80     String rules = "\\U00020087 > x;"; // convert CJK UNIFIED IDEOGRAPH-20087 to an x
  81     Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
  82     assertTrue(custom.getFilter() == null);
  83     new ICUTransformFilter(new KeywordTokenizer(new StringReader("")), custom);
  84     assertTrue(custom.getFilter().equals(new UnicodeSet("[\\U00020087]")));
  85   }
  86
  87   private void checkToken(Transliterator transform, String input, String expected) throws IOException {
  88     TokenStream ts = new ICUTransformFilter(new KeywordTokenizer((new StringReader(input))), transform);
  89     assertTokenStreamContents(ts, new String[] { expected });
  90   }
  91
  92   /** blast some random strings through the analyzer */
  93   public void testRandomStrings() throws Exception {
  94     final Transliterator transform = Transliterator.getInstance("Any-Latin");
  95     Analyzer a = new ReusableAnalyzerBase() {
  96       @Override
  97       protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  98         Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
  99         return new TokenStreamComponents(tokenizer, new ICUTransformFilter(tokenizer, transform));
 100       }
 101     };
 102     checkRandomData(random, a, 1000*RANDOM_MULTIPLIER);
 103   }
 104 }