1 package org.apache.lucene.analysis;
4 * Copyright 2005 The Apache Software Foundation
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
19 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
20 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
21 import org.apache.lucene.util.English;
22 import org.apache.lucene.util.Version;
24 import java.io.IOException;
25 import java.io.StringReader;
26 import java.util.ArrayList;
27 import java.util.Arrays;
29 import java.util.HashSet;
31 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
32 import org.apache.lucene.analysis.MockTokenizer;
33 import org.apache.lucene.analysis.TokenStream;
35 public class TestStopFilter extends BaseTokenStreamTestCase {
37 // other StopFilter functionality is already tested by TestStopAnalyzer
39 public void testExactCase() throws IOException {
40 StringReader reader = new StringReader("Now is The Time");
41 Set<String> stopWords = new HashSet<String>(Arrays.asList("is", "the", "Time"));
42 TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords, false);
43 assertTokenStreamContents(stream, new String[] { "Now", "The" });
46 public void testIgnoreCase() throws IOException {
47 StringReader reader = new StringReader("Now is The Time");
48 Set<Object> stopWords = new HashSet<Object>(Arrays.asList( "is", "the", "Time" ));
49 TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords, true);
50 assertTokenStreamContents(stream, new String[] { "Now" });
53 public void testStopFilt() throws IOException {
54 StringReader reader = new StringReader("Now is The Time");
55 String[] stopWords = new String[] { "is", "the", "Time" };
56 Set<Object> stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords);
57 TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
58 assertTokenStreamContents(stream, new String[] { "Now", "The" });
/** Tests the position increments applied by StopFilter with and without enabling this option. */
// Runs doTestStopPositons against three filter setups built over the same text:
// a single StopFilter checked with increments enabled, a single StopFilter checked
// with increments disabled, and two chained StopFilters checked with increments enabled.
// NOTE(review): the numbering embedded in this excerpt skips lines inside this method
// (the close of the first loop and the whole body of the a0/a1 loop), so some
// statements and braces are not visible here - confirm against the complete file.
64 public void testStopPositons() throws IOException {
65 StringBuilder sb = new StringBuilder();
// a collects the stop words: English.intToEnglish(i), trimmed, for every i in [0, 20)
// that is not a multiple of 3.
66 ArrayList<String> a = new ArrayList<String>();
67 for (int i=0; i<20; i++) {
68 String w = English.intToEnglish(i).trim();
// sb receives all twenty words, each followed by a space; it is the text to tokenize.
69 sb.append(w).append(" ");
70 if (i%3 != 0) a.add(w);
73 String stopWords[] = a.toArray(new String[0]);
74 for (int i=0; i<a.size(); i++) log("Stop: "+stopWords[i]);
75 Set<Object> stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords);
// First run: filter constructed with Version.LUCENE_24, checked with increments enabled.
77 StringReader reader = new StringReader(sb.toString());
78 StopFilter stpf = new StopFilter(Version.LUCENE_24, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
79 doTestStopPositons(stpf,true);
// Second run: fresh reader over the same text, filter constructed with
// TEST_VERSION_CURRENT, checked with increments disabled.
81 reader = new StringReader(sb.toString());
82 stpf = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
83 doTestStopPositons(stpf,false);
84 // with increments, concatenating two stop filters
85 ArrayList<String> a0 = new ArrayList<String>();
86 ArrayList<String> a1 = new ArrayList<String>();
// NOTE(review): the body of this loop is not present in this excerpt; presumably it
// distributes the entries of a between a0 and a1 - confirm against the complete file.
87 for (int i=0; i<a.size(); i++) {
94 String stopWords0[] = a0.toArray(new String[0]);
95 for (int i=0; i<a0.size(); i++) log("Stop0: "+stopWords0[i]);
96 String stopWords1[] = a1.toArray(new String[0]);
97 for (int i=0; i<a1.size(); i++) log("Stop1: "+stopWords1[i]);
98 Set<Object> stopSet0 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords0);
99 Set<Object> stopSet1 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords1);
100 reader = new StringReader(sb.toString());
101 StopFilter stpf0 = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet0); // first part of the set
// The inner filter gets increments enabled explicitly here; the outer filter's flag
// is set inside doTestStopPositons from its boolean argument.
102 stpf0.setEnablePositionIncrements(true);
103 StopFilter stpf01 = new StopFilter(TEST_VERSION_CURRENT, stpf0, stopSet1); // two stop filters concatenated!
104 doTestStopPositons(stpf01,true);
// Pulls tokens from stpf and checks each term and its position increment.
//   stpf            - filter under test; its enablePositionIncrements flag is overwritten here.
//   enableIcrements - value passed to setEnablePositionIncrements; also selects the
//                     expected increment in the assertions below.
// NOTE(review): the numbering embedded in this excerpt skips one line after the
// attribute lookups and several after the final assertFalse, so statements may be
// missing from this view - confirm against the complete file.
107 private void doTestStopPositons(StopFilter stpf, boolean enableIcrements) throws IOException {
108 log("---> test with enable-increments-"+(enableIcrements?"enabled":"disabled"));
109 stpf.setEnablePositionIncrements(enableIcrements);
110 CharTermAttribute termAtt = stpf.getAttribute(CharTermAttribute.class);
111 PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
// One token is expected for each i = 0, 3, 6, ..., 18.
113 for (int i=0; i<20; i+=3) {
114 assertTrue(stpf.incrementToken());
115 log("Token "+i+": "+stpf);
116 String w = English.intToEnglish(i).trim();
117 assertEquals("expecting token "+i+" to be "+w,w,termAtt.toString());
// Expected increment: with increments enabled, 1 for the first token and 3 for every
// later one; with increments disabled, always 1.
// NOTE(review): the failure message only describes the increments-enabled case and
// would be misleading if this assertion fails with increments disabled.
118 assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,posIncrAtt.getPositionIncrement());
// After the seven expected tokens the stream must report no more tokens.
120 assertFalse(stpf.incrementToken());
125 // print debug info depending on VERBOSE
// Writes s to standard output.
// NOTE(review): as visible in this excerpt the message is printed unconditionally,
// while the comment preceding this method mentions VERBOSE; the embedded numbering
// skips a line between the signature and the println, so a guard may be missing
// from this view - confirm against the complete file.
126 private static void log(String s) {
128 System.out.println(s);