1 package org.apache.lucene.analysis.br;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import java.io.IOException;
21 import java.io.StringReader;
23 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
24 import org.apache.lucene.analysis.Analyzer;
25 import org.apache.lucene.analysis.CharArraySet;
26 import org.apache.lucene.analysis.KeywordMarkerFilter;
27 import org.apache.lucene.analysis.LowerCaseTokenizer;
/**
 * Test the Brazilian Stem Filter, which only modifies the term text.
 * It is very similar to the snowball portuguese algorithm but not exactly the same.
 */
35 public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
37 public void testWithSnowballExamples() throws Exception {
39 check("boainain", "boainain");
40 check("boas", "boas");
41 check("bôas", "boas"); // removes diacritic: different from snowball portugese
42 check("boassu", "boassu");
43 check("boataria", "boat");
44 check("boate", "boat");
45 check("boates", "boat");
46 check("boatos", "boat");
49 check("bobagem", "bobag");
50 check("bobagens", "bobagens");
51 check("bobalhões", "bobalho"); // removes diacritic: different from snowball portugese
52 check("bobear", "bob");
53 check("bobeira", "bobeir");
54 check("bobinho", "bobinh");
55 check("bobinhos", "bobinh");
57 check("bobs", "bobs");
59 check("bocadas", "boc");
60 check("bocadinho", "bocadinh");
61 check("bocado", "boc");
62 check("bocaiúva", "bocaiuv"); // removes diacritic: different from snowball portuguese
63 check("boçal", "bocal"); // removes diacritic: different from snowball portuguese
64 check("bocarra", "bocarr");
65 check("bocas", "boc");
67 check("bodoque", "bodoqu");
68 check("body", "body");
69 check("boeing", "boeing");
70 check("boem", "boem");
71 check("boemia", "boem");
72 check("boêmio", "boemi"); // removes diacritic: different from snowball portuguese
73 check("bogotá", "bogot");
75 check("bóia", "boi"); // removes diacritic: different from snowball portuguese
76 check("boiando", "boi");
77 check("quiabo", "quiab");
78 check("quicaram", "quic");
79 check("quickly", "quickly");
80 check("quieto", "quiet");
81 check("quietos", "quiet");
82 check("quilate", "quilat");
83 check("quilates", "quilat");
84 check("quilinhos", "quilinh");
85 check("quilo", "quil");
86 check("quilombo", "quilomb");
87 check("quilométricas", "quilometr"); // removes diacritic: different from snowball portuguese
88 check("quilométricos", "quilometr"); // removes diacritic: different from snowball portuguese
89 check("quilômetro", "quilometr"); // removes diacritic: different from snowball portoguese
90 check("quilômetros", "quilometr"); // removes diacritic: different from snowball portoguese
91 check("quilos", "quil");
92 check("quimica", "quimic");
93 check("quilos", "quil");
94 check("quimica", "quimic");
95 check("quimicas", "quimic");
96 check("quimico", "quimic");
97 check("quimicos", "quimic");
98 check("quimioterapia", "quimioterap");
99 check("quimioterápicos", "quimioterap"); // removes diacritic: different from snowball portoguese
100 check("quimono", "quimon");
101 check("quincas", "quinc");
102 check("quinhão", "quinha"); // removes diacritic: different from snowball portoguese
103 check("quinhentos", "quinhent");
104 check("quinn", "quinn");
105 check("quino", "quin");
106 check("quinta", "quint");
107 check("quintal", "quintal");
108 check("quintana", "quintan");
109 check("quintanilha", "quintanilh");
110 check("quintão", "quinta"); // removes diacritic: different from snowball portoguese
111 check("quintessência", "quintessente"); // versus snowball portuguese 'quintessent'
112 check("quintino", "quintin");
113 check("quinto", "quint");
114 check("quintos", "quint");
115 check("quintuplicou", "quintuplic");
116 check("quinze", "quinz");
117 check("quinzena", "quinzen");
118 check("quiosque", "quiosqu");
121 public void testNormalization() throws Exception {
122 check("Brasil", "brasil"); // lowercase by default
123 check("Brasília", "brasil"); // remove diacritics
124 check("quimio5terápicos", "quimio5terapicos"); // contains non-letter, diacritic will still be removed
125 check("áá", "áá"); // token is too short: diacritics are not removed
126 check("ááá", "aaa"); // normally, diacritics are removed
129 public void testReusableTokenStream() throws Exception {
130 Analyzer a = new BrazilianAnalyzer(TEST_VERSION_CURRENT);
131 checkReuse(a, "boa", "boa");
132 checkReuse(a, "boainain", "boainain");
133 checkReuse(a, "boas", "boas");
134 checkReuse(a, "bôas", "boas"); // removes diacritic: different from snowball portugese
137 public void testStemExclusionTable() throws Exception {
138 BrazilianAnalyzer a = new BrazilianAnalyzer(TEST_VERSION_CURRENT);
139 a.setStemExclusionTable(new String[] { "quintessência" });
140 checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
/**
 * Backwards-compatibility path: an exclusion set passed directly to the
 * (deprecated) BrazilianStemFilter constructor must keep matching tokens
 * unstemmed — "brasília" is expected verbatim while "brasilia" stems to "brasil".
 * NOTE(review): as shown, {@code set} is created but never populated before
 * use — a {@code set.add("Brasília")} line may have been lost; verify against
 * version history before relying on this comment.
 */
public void testStemExclusionTableBWCompat() throws IOException {
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  BrazilianStemFilter filter = new BrazilianStemFilter(
      new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader("Brasília Brasilia")), set);
  assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}
/**
 * Tokens marked as keywords by KeywordMarkerFilter must not be stemmed:
 * "brasília" is expected verbatim while unmarked "brasilia" stems to "brasil".
 * NOTE(review): as shown, {@code set} is created but never populated before
 * use — a {@code set.add("Brasília")} line may have been lost; verify against
 * version history before relying on this comment.
 */
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  BrazilianStemFilter filter = new BrazilianStemFilter(
      new KeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
          "Brasília Brasilia")), set));
  assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}
/**
 * Combines the keyword attribute with the legacy exclusion-table constructor:
 * with "Brasilia" in the exclusion set ({@code set1}) and the keyword marker
 * active, neither token is stemmed ("brasília", "brasilia").
 * NOTE(review): as shown, {@code set} is created but never populated before
 * use — a {@code set.add("Brasília")} line may have been lost; verify against
 * version history before relying on this comment.
 */
public void testWithKeywordAttributeAndExclusionTable() throws IOException {
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  CharArraySet set1 = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  set1.add("Brasilia");
  BrazilianStemFilter filter = new BrazilianStemFilter(
      new KeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
          "Brasília Brasilia")), set), set1);
  assertTokenStreamContents(filter, new String[] { "brasília", "brasilia" });
}
172 * Test that changes to the exclusion table are applied immediately
173 * when using reusable token streams.
175 public void testExclusionTableReuse() throws Exception {
176 BrazilianAnalyzer a = new BrazilianAnalyzer(TEST_VERSION_CURRENT);
177 checkReuse(a, "quintessência", "quintessente");
178 a.setStemExclusionTable(new String[] { "quintessência" });
179 checkReuse(a, "quintessência", "quintessência");
182 private void check(final String input, final String expected) throws Exception {
183 checkOneTerm(new BrazilianAnalyzer(TEST_VERSION_CURRENT), input, expected);
186 private void checkReuse(Analyzer a, String input, String expected) throws Exception {
187 checkOneTermReuse(a, input, expected);
190 /** blast some random strings through the analyzer */
191 public void testRandomStrings() throws Exception {
192 checkRandomData(random, new BrazilianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);