X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java

diff --git a/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java b/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java
new file mode 100644
index 0000000..14b766e
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java
@@ -0,0 +1,227 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.English;
+import java.io.IOException;
+import java.io.StringReader;
+
+
+/**
+ * tests for the TeeSinkTokenFilter
+ */
+public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
+  protected StringBuilder buffer1;
+  protected StringBuilder buffer2;
+  protected String[] tokens1;
+  protected String[] tokens2;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"};
+    tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"};
+    buffer1 = new StringBuilder();
+
+    for (int i = 0; i < tokens1.length; i++) {
+      buffer1.append(tokens1[i]).append(' ');
+    }
+    buffer2 = new StringBuilder();
+    for (int i = 0; i < tokens2.length; i++) {
+      buffer2.append(tokens2[i]).append(' ');
+    }
+  }
+
+  static final TeeSinkTokenFilter.SinkFilter theFilter = new TeeSinkTokenFilter.SinkFilter() {
+    @Override
+    public boolean accept(AttributeSource a) {
+      CharTermAttribute termAtt = a.getAttribute(CharTermAttribute.class);
+      return termAtt.toString().equalsIgnoreCase("The");
+    }
+  };
+
+  static final TeeSinkTokenFilter.SinkFilter dogFilter = new TeeSinkTokenFilter.SinkFilter() {
+    @Override
+    public boolean accept(AttributeSource a) {
+      CharTermAttribute termAtt = a.getAttribute(CharTermAttribute.class);
+      return termAtt.toString().equalsIgnoreCase("Dogs");
+    }
+  };
+
+  public void testGeneral() throws IOException {
+    final TeeSinkTokenFilter source = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
+    final TokenStream sink1 = source.newSinkTokenStream();
+    final TokenStream sink2 = source.newSinkTokenStream(theFilter);
+
+    source.addAttribute(CheckClearAttributesAttribute.class);
+    sink1.addAttribute(CheckClearAttributesAttribute.class);
+    sink2.addAttribute(CheckClearAttributesAttribute.class);
+
+    assertTokenStreamContents(source, tokens1);
+    assertTokenStreamContents(sink1, tokens1);
+    assertTokenStreamContents(sink2, new String[]{"The", "the"});
+  }
+
+  public void testMultipleSources() throws Exception {
+    final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
+    final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
+    final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
+    tee1.reset();
+    final TokenStream source1 = new CachingTokenFilter(tee1);
+
+    tee1.addAttribute(CheckClearAttributesAttribute.class);
+    dogDetector.addAttribute(CheckClearAttributesAttribute.class);
+    theDetector.addAttribute(CheckClearAttributesAttribute.class);
+
+    final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.toString()), MockTokenizer.WHITESPACE, false));
+    tee2.addSinkTokenStream(dogDetector);
+    tee2.addSinkTokenStream(theDetector);
+    final TokenStream source2 = tee2;
+
+    assertTokenStreamContents(source1, tokens1);
+    assertTokenStreamContents(source2, tokens2);
+
+    assertTokenStreamContents(theDetector, new String[]{"The", "the", "The", "the"});
+    assertTokenStreamContents(dogDetector, new String[]{"Dogs", "Dogs"});
+
+    source1.reset();
+    TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);
+    String[] lowerCaseTokens = new String[tokens1.length];
+    for (int i = 0; i < tokens1.length; i++)
+      lowerCaseTokens[i] = tokens1[i].toLowerCase();
+    assertTokenStreamContents(lowerCasing, lowerCaseTokens);
+  }
+
+  /**
+   * Not an explicit test; just useful to print out some info on performance
+   *
+   * @throws Exception
+   */
+  public void performance() throws Exception {
+    int[] tokCount = {100, 500, 1000, 2000, 5000, 10000};
+    int[] modCounts = {1, 2, 5, 10, 20, 50, 100, 200, 500};
+    for (int k = 0; k < tokCount.length; k++) {
+      StringBuilder buffer = new StringBuilder();
+      System.out.println("-----Tokens: " + tokCount[k] + "-----");
+      for (int i = 0; i < tokCount[k]; i++) {
+        buffer.append(English.intToEnglish(i).toUpperCase()).append(' ');
+      }
+      //make sure we produce the same tokens
+      TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))));
+      TokenStream sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(100));
+      teeStream.consumeAllTokens();
+      TokenStream stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))), 100);
+      CharTermAttribute tfTok = stream.addAttribute(CharTermAttribute.class);
+      CharTermAttribute sinkTok = sink.addAttribute(CharTermAttribute.class);
+      for (int i=0; stream.incrementToken(); i++) {
+        assertTrue(sink.incrementToken());
+        assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok));
+      }
+
+      //simulate two fields, each being analyzed once, for 20 documents
+      for (int j = 0; j < modCounts.length; j++) {
+        int tfPos = 0;
+        long start = System.currentTimeMillis();
+        for (int i = 0; i < 20; i++) {
+          stream = new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString())));
+          PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
+          while (stream.incrementToken()) {
+            tfPos += posIncrAtt.getPositionIncrement();
+          }
+          stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))), modCounts[j]);
+          posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
+          while (stream.incrementToken()) {
+            tfPos += posIncrAtt.getPositionIncrement();
+          }
+        }
+        long finish = System.currentTimeMillis();
+        System.out.println("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
+        int sinkPos = 0;
+        //simulate one field with one sink
+        start = System.currentTimeMillis();
+        for (int i = 0; i < 20; i++) {
+          teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))));
+          sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(modCounts[j]));
+          PositionIncrementAttribute posIncrAtt = teeStream.getAttribute(PositionIncrementAttribute.class);
+          while (teeStream.incrementToken()) {
+            sinkPos += posIncrAtt.getPositionIncrement();
+          }
+          //System.out.println("Modulo--------");
+          posIncrAtt = sink.getAttribute(PositionIncrementAttribute.class);
+          while (sink.incrementToken()) {
+            sinkPos += posIncrAtt.getPositionIncrement();
+          }
+        }
+        finish = System.currentTimeMillis();
+        System.out.println("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
+        assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
+
+      }
+      System.out.println("- End Tokens: " + tokCount[k] + "-----");
+    }
+
+  }
+
+
+  class ModuloTokenFilter extends TokenFilter {
+
+    int modCount;
+
+    ModuloTokenFilter(TokenStream input, int mc) {
+      super(input);
+      modCount = mc;
+    }
+
+    int count = 0;
+
+    //pass through every modCount'th token
+    @Override
+    public boolean incrementToken() throws IOException {
+      boolean hasNext;
+      for (hasNext = input.incrementToken();
+           hasNext && count % modCount != 0;
+           hasNext = input.incrementToken()) {
+        count++;
+      }
+      count++;
+      return hasNext;
+    }
+  }
+
+  class ModuloSinkFilter extends TeeSinkTokenFilter.SinkFilter {
+    int count = 0;
+    int modCount;
+
+    ModuloSinkFilter(int mc) {
+      modCount = mc;
+    }
+
+    @Override
+    public boolean accept(AttributeSource a) {
+      boolean b = (a != null && count % modCount == 0);
+      count++;
+      return b;
+    }
+
+  }
+}
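
Editor's note: the sketch below is not part of the patch above. It is a minimal, hypothetical standalone example of the tee/sink pattern this test exercises, written against the same Lucene 3.x API the test uses (TeeSinkTokenFilter, newSinkTokenStream, consumeAllTokens). The class name, input text, and printed output are illustrative assumptions, not taken from the test file.

// TeeSinkUsageSketch.java -- hypothetical example, not part of the diff above.
// One analysis pass feeds several sinks, so the source text is tokenized once.
import java.io.StringReader;

import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;

public class TeeSinkUsageSketch {
  public static void main(String[] args) throws Exception {
    TokenStream base = new WhitespaceTokenizer(Version.LUCENE_35,
        new StringReader("The quick Burgundy Fox jumped over the lazy Red Dogs"));
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(base);

    // Sink 1 buffers every token; sink 2 keeps only case-insensitive "the",
    // mirroring theFilter in the test.
    TokenStream everything = tee.newSinkTokenStream();
    TokenStream theOnly = tee.newSinkTokenStream(new TeeSinkTokenFilter.SinkFilter() {
      @Override
      public boolean accept(AttributeSource source) {
        return source.getAttribute(CharTermAttribute.class).toString().equalsIgnoreCase("the");
      }
    });

    tee.consumeAllTokens(); // single pass over the source; both sinks are now filled

    // Replay the filtered sink; prints "The" then "the".
    // (The `everything` sink could be replayed the same way to get all ten tokens.)
    CharTermAttribute term = theOnly.addAttribute(CharTermAttribute.class);
    theOnly.reset();
    while (theOnly.incrementToken()) {
      System.out.println(term);
    }
    tee.close();
  }
}

The performance() method in the test measures exactly this trade-off: analyzing two fields as two full passes over the text versus one pass through the tee plus a buffered sink replay.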