1 package org.apache.lucene.analysis.pt;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;
22 import java.io.IOException;
23 import java.io.Reader;
25 import org.apache.lucene.analysis.Analyzer;
26 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
27 import org.apache.lucene.analysis.TokenStream;
28 import org.apache.lucene.analysis.Tokenizer;
29 import org.apache.lucene.analysis.LowerCaseFilter;
30 import org.apache.lucene.analysis.standard.StandardTokenizer;
31 import org.apache.lucene.analysis.ReusableAnalyzerBase;
34 * Simple tests for {@link PortugueseStemFilter}
36 public class TestPortugueseStemFilter extends BaseTokenStreamTestCase {
// NOTE(review): this listing looks incomplete. The embedded line numbers skip
// (for example 39 to 41 and 43 to 48) and no closing braces are present.
// Restore the missing lines from the upstream file before compiling.

// Analyzer shared by every test in this class. Its chain, as built below, is
// StandardTokenizer, then LowerCaseFilter, then PortugueseStemFilter.
37 private Analyzer analyzer = new ReusableAnalyzerBase() {
39 protected TokenStreamComponents createComponents(String fieldName,
// NOTE(review): reader is used on the next line but its declaration is not
// visible in this listing; presumably it is the second parameter of
// createComponents. Confirm against the upstream file.
41 Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
// Lowercasing is applied to the tokenizer output before stemming.
42 TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
// The tokenizer is handed over as the source and the stem filter, wrapping the
// lowercased stream, as the end of the chain.
43 return new TokenStreamComponents(source, new PortugueseStemFilter(result));
48 * Test the example from the paper "Assessing the impact of stemming accuracy
49 * on information retrieval"
51 public void testExamples() throws IOException {
// NOTE(review): the call that consumes the sentence and the token list below
// is not visible in this listing; presumably an assertion helper that also
// takes the analyzer. Confirm against the upstream file.
// Input: one Portuguese sentence, split across three concatenated literals.
54 "O debate político, pelo menos o que vem a público, parece, de modo nada "
55 + "surpreendente, restrito a temas menores. Mas há, evidentemente, "
56 + "grandes questões em jogo nas eleições que se aproximam.",
// Presumably the expected output: 31 tokens, one per input word in order,
// lowercased and stemmed, with accents absent (político gives politic,
// há gives ha, eleições gives eleic).
58 "o", "debat", "politic", "pel", "menos", "o", "que", "vem", "a",
59 "public", "parec", "de", "mod", "nad", "surpreend", "restrit",
60 "a", "tem", "men", "mas", "ha", "evid", "grand", "quest",
61 "em", "jog", "na", "eleic", "que", "se", "aproxim"
65 /** Test against a vocabulary from the reference impl */
66 public void testVocabulary() throws IOException {
// Passes the shared analyzer, the data file ptrslptestdata.zip and the name
// ptrslp.txt (presumably an entry inside that zip) to assertVocabulary.
67 assertVocabulary(analyzer, getDataFile("ptrslptestdata.zip"), "ptrslp.txt");
70 /** blast some random strings through the analyzer */
71 public void testRandomStrings() throws Exception {
// The third argument scales with RANDOM_MULTIPLIER; presumably it is the
// number of random iterations. Confirm against checkRandomData.
72 checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);