lucene-java-3.5.0/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilter.java

   1 package org.apache.lucene.analysis.icu;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.Reader;
  22
  23 import org.apache.lucene.analysis.Analyzer;
  24 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
  25 import org.apache.lucene.analysis.TokenStream;
  26 import org.apache.lucene.analysis.WhitespaceTokenizer;
  27
  28 /**
  29  * Tests ICUFoldingFilter
  30  */
  31 public class TestICUFoldingFilter extends BaseTokenStreamTestCase {
  32   Analyzer a = new Analyzer() {
  33     @Override
  34     public TokenStream tokenStream(String fieldName, Reader reader) {
  35       return new ICUFoldingFilter(
  36           new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader));
  37     }
  38   };
  39   public void testDefaults() throws IOException {
  40     // case folding
  41     assertAnalyzesTo(a, "This is a test", new String[] { "this", "is", "a", "test" });
  42
  43     // case folding
  44     assertAnalyzesTo(a, "Ruß", new String[] { "russ" });
  45
  46     // case folding with accent removal
  47     assertAnalyzesTo(a, "ΜΆΪΟΣ", new String[] { "μαιοσ" });
  48     assertAnalyzesTo(a, "Μάϊος", new String[] { "μαιοσ" });
  49
  50     // supplementary case folding
  51     assertAnalyzesTo(a, "𐐖", new String[] { "𐐾" });
  52
  53     // normalization
  54     assertAnalyzesTo(a, "ﴳﴺﰧ", new String[] { "طمطمطم" });
  55
  56     // removal of default ignorables
  57     assertAnalyzesTo(a, "क्‍ष", new String[] { "कष" });
  58
  59     // removal of latin accents (composed)
  60     assertAnalyzesTo(a, "résumé", new String[] { "resume" });
  61
  62     // removal of latin accents (decomposed)
  63     assertAnalyzesTo(a, "re\u0301sume\u0301", new String[] { "resume" });
  64
  65     // fold native digits
  66     assertAnalyzesTo(a, "৭০৬", new String[] { "706" });
  67
  68     // ascii-folding-filter type stuff
  69     assertAnalyzesTo(a, "đis is cræzy", new String[] { "dis", "is", "craezy" });
  70
  71     // proper downcasing of Turkish dotted-capital I
  72     // (according to default case folding rules)
  73     assertAnalyzesTo(a, "ELİF", new String[] { "elif" });
  74
  75     // handling of decomposed combining-dot-above
  76     assertAnalyzesTo(a, "eli\u0307f", new String[] { "elif" });
  77   }
  78
  79   /** blast some random strings through the analyzer */
  80   public void testRandomStrings() throws Exception {
  81     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
  82   }
  83 }