1 package org.apache.lucene.analysis;
4 * Copyright 2004 The Apache Software Foundation
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
19 import org.apache.lucene.analysis.standard.StandardFilter;
20 import org.apache.lucene.analysis.standard.StandardTokenizer;
21 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
22 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
23 import org.apache.lucene.util.AttributeSource;
24 import org.apache.lucene.util.English;
25 import java.io.IOException;
26 import java.io.StringReader;
30 * Tests for {@link TeeSinkTokenFilter}.
32 public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
// Space-joined text of tokens1; rebuilt in setUp() and fed to the tokenizers in the tests.
33 protected StringBuilder buffer1;
// Space-joined text of tokens2; rebuilt in setUp() and used as the second source in testMultipleSources().
34 protected StringBuilder buffer2;
// Expected token sequence for the first input text.
35 protected String[] tokens1;
// Expected token sequence for the second input text.
36 protected String[] tokens2;
// Builds the fixtures: two fixed token arrays and, for each, a buffer holding
// the tokens joined by single spaces (a trailing space follows the last token).
39 public void setUp() throws Exception {
41 tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"};
42 tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"};
43 buffer1 = new StringBuilder();
// Concatenate tokens1 into whitespace-separated text.
45 for (int i = 0; i < tokens1.length; i++) {
46 buffer1.append(tokens1[i]).append(' ');
48 buffer2 = new StringBuilder();
// Same for tokens2.
49 for (int i = 0; i < tokens2.length; i++) {
50 buffer2.append(tokens2[i]).append(' ');
// Sink filter that accepts only states whose term text equals "The", ignoring case
// (so both "The" and "the" pass).
54 static final TeeSinkTokenFilter.SinkFilter theFilter = new TeeSinkTokenFilter.SinkFilter() {
56 public boolean accept(AttributeSource a) {
// Read the current term from the attribute source being offered to the sink.
57 CharTermAttribute termAtt = a.getAttribute(CharTermAttribute.class);
58 return termAtt.toString().equalsIgnoreCase("The");
// Sink filter that accepts only states whose term text equals "Dogs", ignoring case.
62 static final TeeSinkTokenFilter.SinkFilter dogFilter = new TeeSinkTokenFilter.SinkFilter() {
64 public boolean accept(AttributeSource a) {
// Read the current term from the attribute source being offered to the sink.
65 CharTermAttribute termAtt = a.getAttribute(CharTermAttribute.class);
66 return termAtt.toString().equalsIgnoreCase("Dogs");
// Single source with two sinks: one unfiltered sink and one restricted by theFilter.
// Checks that the source and the unfiltered sink both yield tokens1, and that the
// filtered sink yields only the two "The"/"the" tokens.
70 public void testGeneral() throws IOException {
// Whitespace tokenization without lowercasing, so the original token case is preserved.
71 final TeeSinkTokenFilter source = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
72 final TokenStream sink1 = source.newSinkTokenStream();
73 final TokenStream sink2 = source.newSinkTokenStream(theFilter);
// Register CheckClearAttributesAttribute on every stream.
// NOTE(review): presumably this lets the base test case verify clearAttributes() handling -- confirm.
75 source.addAttribute(CheckClearAttributesAttribute.class);
76 sink1.addAttribute(CheckClearAttributesAttribute.class);
77 sink2.addAttribute(CheckClearAttributesAttribute.class);
// The source is checked before the sinks.
// NOTE(review): presumably the sinks only receive states as the source is consumed -- confirm before reordering.
79 assertTokenStreamContents(source, tokens1);
80 assertTokenStreamContents(sink1, tokens1);
81 assertTokenStreamContents(sink2, new String[]{"The", "the"});
// Two tee sources (over buffer1 and buffer2) sharing the same two filtered sinks.
// Checks each source's own output, the combined sink output, and that the cached
// first source can be consumed again through a LowerCaseFilter.
84 public void testMultipleSources() throws Exception {
85 final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
// Both sinks are created from tee1 and later also attached to tee2.
86 final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
87 final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
// Wrap tee1 in a CachingTokenFilter; source1 is consumed twice in this test (directly, then via lowerCasing).
89 final TokenStream source1 = new CachingTokenFilter(tee1);
91 tee1.addAttribute(CheckClearAttributesAttribute.class);
92 dogDetector.addAttribute(CheckClearAttributesAttribute.class);
93 theDetector.addAttribute(CheckClearAttributesAttribute.class);
// Second source over the second text, feeding the same two sinks.
95 final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.toString()), MockTokenizer.WHITESPACE, false));
96 tee2.addSinkTokenStream(dogDetector);
97 tee2.addSinkTokenStream(theDetector);
98 final TokenStream source2 = tee2;
// Consume both sources first, then check what the shared sinks collected.
100 assertTokenStreamContents(source1, tokens1);
101 assertTokenStreamContents(source2, tokens2);
// Expected sink contents: matches from tokens1 followed by matches from tokens2.
103 assertTokenStreamContents(theDetector, new String[]{"The", "the", "The", "the"});
104 assertTokenStreamContents(dogDetector, new String[]{"Dogs", "Dogs"});
// Run the already-consumed source1 through a LowerCaseFilter and expect tokens1 lowercased.
107 TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);
108 String[] lowerCaseTokens = new String[tokens1.length];
109 for (int i = 0; i < tokens1.length; i++)
110 lowerCaseTokens[i] = tokens1[i].toLowerCase();
111 assertTokenStreamContents(lowerCasing, lowerCaseTokens);
115 * Not an explicit test, just useful to print out some info on performance
// Timing comparison printed to stdout: analyzing the same text twice (once plain,
// once through ModuloTokenFilter) versus analyzing it once through a
// TeeSinkTokenFilter with a ModuloSinkFilter sink. Runs for several input sizes
// (tokCount) and several modulo values (modCounts).
119 public void performance() throws Exception {
120 int[] tokCount = {100, 500, 1000, 2000, 5000, 10000};
121 int[] modCounts = {1, 2, 5, 10, 20, 50, 100, 200, 500};
122 for (int k = 0; k < tokCount.length; k++) {
123 StringBuilder buffer = new StringBuilder();
124 System.out.println("-----Tokens: " + tokCount[k] + "-----");
// Build the input text: the numbers 0..tokCount[k]-1 spelled out in upper-case English, space separated.
125 for (int i = 0; i < tokCount[k]; i++) {
126 buffer.append(English.intToEnglish(i).toUpperCase()).append(' ');
128 //make sure we produce the same tokens
129 TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))));
130 TokenStream sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(100));
// Drain the tee up front so the sink can be read afterwards.
131 teeStream.consumeAllTokens();
132 TokenStream stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))), 100);
133 CharTermAttribute tfTok = stream.addAttribute(CharTermAttribute.class);
134 CharTermAttribute sinkTok = sink.addAttribute(CharTermAttribute.class);
// Walk the filter stream and the sink in lock step; every token must match.
135 for (int i=0; stream.incrementToken(); i++) {
136 assertTrue(sink.incrementToken());
137 assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok) == true);
140 //simulate two fields, each being analyzed once, for 20 documents
141 for (int j = 0; j < modCounts.length; j++) {
// NOTE(review): tfPos and sinkPos are accumulated below; their declarations are not visible in this excerpt.
143 long start = System.currentTimeMillis();
144 for (int i = 0; i < 20; i++) {
// First "field": the plain analysis chain.
145 stream = new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString())));
146 PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
147 while (stream.incrementToken()) {
148 tfPos += posIncrAtt.getPositionIncrement();
// Second "field": the same text re-analyzed and reduced by ModuloTokenFilter.
150 stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))), modCounts[j]);
151 posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
152 while (stream.incrementToken()) {
153 tfPos += posIncrAtt.getPositionIncrement();
156 long finish = System.currentTimeMillis();
157 System.out.println("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
159 //simulate one field with one sink
160 start = System.currentTimeMillis();
161 for (int i = 0; i < 20; i++) {
// Analyze the text once; the sink collects the modulo-selected tokens as a side product.
162 teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))));
163 sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(modCounts[j]));
164 PositionIncrementAttribute posIncrAtt = teeStream.getAttribute(PositionIncrementAttribute.class);
165 while (teeStream.incrementToken()) {
166 sinkPos += posIncrAtt.getPositionIncrement();
168 //System.out.println("Modulo--------");
// Then read back what the sink collected.
169 posIncrAtt = sink.getAttribute(PositionIncrementAttribute.class);
170 while (sink.incrementToken()) {
171 sinkPos += posIncrAtt.getPositionIncrement();
174 finish = System.currentTimeMillis();
175 System.out.println("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
// Sanity check: both approaches must have summed the same position increments.
176 assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
179 System.out.println("- End Tokens: " + tokCount[k] + "-----");
// Token filter used by performance(): it skips input tokens while
// count % modCount != 0 and stops at the first token where that test fails.
185 class ModuloTokenFilter extends TokenFilter {
// NOTE(review): mc is presumably stored as modCount; the assignment is not visible in this excerpt -- confirm.
189 ModuloTokenFilter(TokenStream input, int mc) {
196 //return only tokens for which count % modCount == 0 (the modulus is configurable, not fixed at 100)
198 public boolean incrementToken() throws IOException {
// Advance the input until it is exhausted or count is a multiple of modCount.
200 for (hasNext = input.incrementToken();
201 hasNext && count % modCount != 0;
202 hasNext = input.incrementToken()) {
210 class ModuloSinkFilter extends TeeSinkTokenFilter.SinkFilter {
214 ModuloSinkFilter(int mc) {
219 public boolean accept(AttributeSource a) {
220 boolean b = (a != null && count % modCount == 0);