+++ /dev/null
-package org.apache.lucene.analysis;
-
-/**
- * Copyright 2004 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.analysis.standard.StandardFilter;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.English;
-import java.io.IOException;
-import java.io.StringReader;
-
-
-/**
- * tests for the TestTeeSinkTokenFilter
- */
-public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
- protected StringBuilder buffer1;
- protected StringBuilder buffer2;
- protected String[] tokens1;
- protected String[] tokens2;
-
- @Override
- public void setUp() throws Exception {
- super.setUp();
- tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"};
- tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"};
- buffer1 = new StringBuilder();
-
- for (int i = 0; i < tokens1.length; i++) {
- buffer1.append(tokens1[i]).append(' ');
- }
- buffer2 = new StringBuilder();
- for (int i = 0; i < tokens2.length; i++) {
- buffer2.append(tokens2[i]).append(' ');
- }
- }
-
- static final TeeSinkTokenFilter.SinkFilter theFilter = new TeeSinkTokenFilter.SinkFilter() {
- @Override
- public boolean accept(AttributeSource a) {
- CharTermAttribute termAtt = a.getAttribute(CharTermAttribute.class);
- return termAtt.toString().equalsIgnoreCase("The");
- }
- };
-
- static final TeeSinkTokenFilter.SinkFilter dogFilter = new TeeSinkTokenFilter.SinkFilter() {
- @Override
- public boolean accept(AttributeSource a) {
- CharTermAttribute termAtt = a.getAttribute(CharTermAttribute.class);
- return termAtt.toString().equalsIgnoreCase("Dogs");
- }
- };
-
- public void testGeneral() throws IOException {
- final TeeSinkTokenFilter source = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
- final TokenStream sink1 = source.newSinkTokenStream();
- final TokenStream sink2 = source.newSinkTokenStream(theFilter);
-
- source.addAttribute(CheckClearAttributesAttribute.class);
- sink1.addAttribute(CheckClearAttributesAttribute.class);
- sink2.addAttribute(CheckClearAttributesAttribute.class);
-
- assertTokenStreamContents(source, tokens1);
- assertTokenStreamContents(sink1, tokens1);
- assertTokenStreamContents(sink2, new String[]{"The", "the"});
- }
-
- public void testMultipleSources() throws Exception {
- final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
- final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
- final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
- tee1.reset();
- final TokenStream source1 = new CachingTokenFilter(tee1);
-
- tee1.addAttribute(CheckClearAttributesAttribute.class);
- dogDetector.addAttribute(CheckClearAttributesAttribute.class);
- theDetector.addAttribute(CheckClearAttributesAttribute.class);
-
- final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.toString()), MockTokenizer.WHITESPACE, false));
- tee2.addSinkTokenStream(dogDetector);
- tee2.addSinkTokenStream(theDetector);
- final TokenStream source2 = tee2;
-
- assertTokenStreamContents(source1, tokens1);
- assertTokenStreamContents(source2, tokens2);
-
- assertTokenStreamContents(theDetector, new String[]{"The", "the", "The", "the"});
- assertTokenStreamContents(dogDetector, new String[]{"Dogs", "Dogs"});
-
- source1.reset();
- TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);
- String[] lowerCaseTokens = new String[tokens1.length];
- for (int i = 0; i < tokens1.length; i++)
- lowerCaseTokens[i] = tokens1[i].toLowerCase();
- assertTokenStreamContents(lowerCasing, lowerCaseTokens);
- }
-
- /**
- * Not an explicit test, just useful to print out some info on performance
- *
- * @throws Exception
- */
- public void performance() throws Exception {
- int[] tokCount = {100, 500, 1000, 2000, 5000, 10000};
- int[] modCounts = {1, 2, 5, 10, 20, 50, 100, 200, 500};
- for (int k = 0; k < tokCount.length; k++) {
- StringBuilder buffer = new StringBuilder();
- System.out.println("-----Tokens: " + tokCount[k] + "-----");
- for (int i = 0; i < tokCount[k]; i++) {
- buffer.append(English.intToEnglish(i).toUpperCase()).append(' ');
- }
- //make sure we produce the same tokens
- TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))));
- TokenStream sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(100));
- teeStream.consumeAllTokens();
- TokenStream stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))), 100);
- CharTermAttribute tfTok = stream.addAttribute(CharTermAttribute.class);
- CharTermAttribute sinkTok = sink.addAttribute(CharTermAttribute.class);
- for (int i=0; stream.incrementToken(); i++) {
- assertTrue(sink.incrementToken());
- assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok) == true);
- }
-
- //simulate two fields, each being analyzed once, for 20 documents
- for (int j = 0; j < modCounts.length; j++) {
- int tfPos = 0;
- long start = System.currentTimeMillis();
- for (int i = 0; i < 20; i++) {
- stream = new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString())));
- PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
- while (stream.incrementToken()) {
- tfPos += posIncrAtt.getPositionIncrement();
- }
- stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))), modCounts[j]);
- posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
- while (stream.incrementToken()) {
- tfPos += posIncrAtt.getPositionIncrement();
- }
- }
- long finish = System.currentTimeMillis();
- System.out.println("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
- int sinkPos = 0;
- //simulate one field with one sink
- start = System.currentTimeMillis();
- for (int i = 0; i < 20; i++) {
- teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))));
- sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(modCounts[j]));
- PositionIncrementAttribute posIncrAtt = teeStream.getAttribute(PositionIncrementAttribute.class);
- while (teeStream.incrementToken()) {
- sinkPos += posIncrAtt.getPositionIncrement();
- }
- //System.out.println("Modulo--------");
- posIncrAtt = sink.getAttribute(PositionIncrementAttribute.class);
- while (sink.incrementToken()) {
- sinkPos += posIncrAtt.getPositionIncrement();
- }
- }
- finish = System.currentTimeMillis();
- System.out.println("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
- assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
-
- }
- System.out.println("- End Tokens: " + tokCount[k] + "-----");
- }
-
- }
-
-
- class ModuloTokenFilter extends TokenFilter {
-
- int modCount;
-
- ModuloTokenFilter(TokenStream input, int mc) {
- super(input);
- modCount = mc;
- }
-
- int count = 0;
-
- //return every 100 tokens
- @Override
- public boolean incrementToken() throws IOException {
- boolean hasNext;
- for (hasNext = input.incrementToken();
- hasNext && count % modCount != 0;
- hasNext = input.incrementToken()) {
- count++;
- }
- count++;
- return hasNext;
- }
- }
-
- class ModuloSinkFilter extends TeeSinkTokenFilter.SinkFilter {
- int count = 0;
- int modCount;
-
- ModuloSinkFilter(int mc) {
- modCount = mc;
- }
-
- @Override
- public boolean accept(AttributeSource a) {
- boolean b = (a != null && count % modCount == 0);
- count++;
- return b;
- }
-
- }
-}
-