lucene-java-3.5.0/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java

   1 package org.apache.lucene.analysis.br;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.StringReader;
  22
  23 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
  24 import org.apache.lucene.analysis.Analyzer;
  25 import org.apache.lucene.analysis.CharArraySet;
  26 import org.apache.lucene.analysis.KeywordMarkerFilter;
  27 import org.apache.lucene.analysis.LowerCaseTokenizer;
  28
  29 /**
  30  * Test the Brazilian Stem Filter, which only modifies the term text.
  31  *
  32  * It is very similar to the snowball portuguese algorithm but not exactly the same.
  33  *
  34  */
  35 public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
  36
  37   public void testWithSnowballExamples() throws Exception {
  38          check("boa", "boa");
  39          check("boainain", "boainain");
  40          check("boas", "boas");
  41          check("bôas", "boas"); // removes diacritic: different from snowball portugese
  42          check("boassu", "boassu");
  43          check("boataria", "boat");
  44          check("boate", "boat");
  45          check("boates", "boat");
  46          check("boatos", "boat");
  47          check("bob", "bob");
  48          check("boba", "bob");
  49          check("bobagem", "bobag");
  50          check("bobagens", "bobagens");
  51          check("bobalhões", "bobalho"); // removes diacritic: different from snowball portugese
  52          check("bobear", "bob");
  53          check("bobeira", "bobeir");
  54          check("bobinho", "bobinh");
  55          check("bobinhos", "bobinh");
  56          check("bobo", "bob");
  57          check("bobs", "bobs");
  58          check("boca", "boc");
  59          check("bocadas", "boc");
  60          check("bocadinho", "bocadinh");
  61          check("bocado", "boc");
  62          check("bocaiúva", "bocaiuv"); // removes diacritic: different from snowball portuguese
  63          check("boçal", "bocal"); // removes diacritic: different from snowball portuguese
  64          check("bocarra", "bocarr");
  65          check("bocas", "boc");
  66          check("bode", "bod");
  67          check("bodoque", "bodoqu");
  68          check("body", "body");
  69          check("boeing", "boeing");
  70          check("boem", "boem");
  71          check("boemia", "boem");
  72          check("boêmio", "boemi"); // removes diacritic: different from snowball portuguese
  73          check("bogotá", "bogot");
  74          check("boi", "boi");
  75          check("bóia", "boi"); // removes diacritic: different from snowball portuguese
  76          check("boiando", "boi");
  77          check("quiabo", "quiab");
  78          check("quicaram", "quic");
  79          check("quickly", "quickly");
  80          check("quieto", "quiet");
  81          check("quietos", "quiet");
  82          check("quilate", "quilat");
  83          check("quilates", "quilat");
  84          check("quilinhos", "quilinh");
  85          check("quilo", "quil");
  86          check("quilombo", "quilomb");
  87          check("quilométricas", "quilometr"); // removes diacritic: different from snowball portuguese
  88          check("quilométricos", "quilometr"); // removes diacritic: different from snowball portuguese
  89          check("quilômetro", "quilometr"); // removes diacritic: different from snowball portoguese
  90          check("quilômetros", "quilometr"); // removes diacritic: different from snowball portoguese
  91          check("quilos", "quil");
  92          check("quimica", "quimic");
  93          check("quilos", "quil");
  94          check("quimica", "quimic");
  95          check("quimicas", "quimic");
  96          check("quimico", "quimic");
  97          check("quimicos", "quimic");
  98          check("quimioterapia", "quimioterap");
  99          check("quimioterápicos", "quimioterap"); // removes diacritic: different from snowball portoguese
 100          check("quimono", "quimon");
 101          check("quincas", "quinc");
 102          check("quinhão", "quinha"); // removes diacritic: different from snowball portoguese
 103          check("quinhentos", "quinhent");
 104          check("quinn", "quinn");
 105          check("quino", "quin");
 106          check("quinta", "quint");
 107          check("quintal", "quintal");
 108          check("quintana", "quintan");
 109          check("quintanilha", "quintanilh");
 110          check("quintão", "quinta"); // removes diacritic: different from snowball portoguese
 111          check("quintessência", "quintessente"); // versus snowball portuguese 'quintessent'
 112          check("quintino", "quintin");
 113          check("quinto", "quint");
 114          check("quintos", "quint");
 115          check("quintuplicou", "quintuplic");
 116          check("quinze", "quinz");
 117          check("quinzena", "quinzen");
 118          check("quiosque", "quiosqu");
 119   }
 120
 121   public void testNormalization() throws Exception {
 122     check("Brasil", "brasil"); // lowercase by default
 123     check("Brasília", "brasil"); // remove diacritics
 124     check("quimio5terápicos", "quimio5terapicos"); // contains non-letter, diacritic will still be removed
 125     check("áá", "áá"); // token is too short: diacritics are not removed
 126     check("ááá", "aaa"); // normally, diacritics are removed
 127   }
 128
 129   public void testReusableTokenStream() throws Exception {
 130     Analyzer a = new BrazilianAnalyzer(TEST_VERSION_CURRENT);
 131     checkReuse(a, "boa", "boa");
 132     checkReuse(a, "boainain", "boainain");
 133     checkReuse(a, "boas", "boas");
 134     checkReuse(a, "bôas", "boas"); // removes diacritic: different from snowball portugese
 135   }
 136
 137   public void testStemExclusionTable() throws Exception {
 138     BrazilianAnalyzer a = new BrazilianAnalyzer(TEST_VERSION_CURRENT);
 139     a.setStemExclusionTable(new String[] { "quintessência" });
 140     checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
 141   }
 142
 143   public void testStemExclusionTableBWCompat() throws IOException {
 144     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
 145     set.add("Brasília");
 146     BrazilianStemFilter filter = new BrazilianStemFilter(
 147         new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader("Brasília Brasilia")), set);
 148     assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
 149   }
 150
 151   public void testWithKeywordAttribute() throws IOException {
 152     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
 153     set.add("Brasília");
 154     BrazilianStemFilter filter = new BrazilianStemFilter(
 155         new KeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
 156             "Brasília Brasilia")), set));
 157     assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
 158   }
 159
 160   public void testWithKeywordAttributeAndExclusionTable() throws IOException {
 161     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
 162     set.add("Brasília");
 163     CharArraySet set1 = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
 164     set1.add("Brasilia");
 165     BrazilianStemFilter filter = new BrazilianStemFilter(
 166         new KeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
 167             "Brasília Brasilia")), set), set1);
 168     assertTokenStreamContents(filter, new String[] { "brasília", "brasilia" });
 169   }
 170
 171   /*
 172    * Test that changes to the exclusion table are applied immediately
 173    * when using reusable token streams.
 174    */
 175   public void testExclusionTableReuse() throws Exception {
 176     BrazilianAnalyzer a = new BrazilianAnalyzer(TEST_VERSION_CURRENT);
 177     checkReuse(a, "quintessência", "quintessente");
 178     a.setStemExclusionTable(new String[] { "quintessência" });
 179     checkReuse(a, "quintessência", "quintessência");
 180   }
 181
 182   private void check(final String input, final String expected) throws Exception {
 183     checkOneTerm(new BrazilianAnalyzer(TEST_VERSION_CURRENT), input, expected);
 184   }
 185
 186   private void checkReuse(Analyzer a, String input, String expected) throws Exception {
 187     checkOneTermReuse(a, input, expected);
 188   }
 189
 190   /** blast some random strings through the analyzer */
 191   public void testRandomStrings() throws Exception {
 192     checkRandomData(random, new BrazilianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
 193   }
 194 }