1 package org.apache.lucene.analysis;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.StringReader;
21 import java.io.IOException;
22 import java.util.ArrayList;
23 import java.util.List;
24 import java.util.Random;
26 import org.apache.lucene.analysis.tokenattributes.*;
27 import org.apache.lucene.util.Attribute;
28 import org.apache.lucene.util.AttributeImpl;
29 import org.apache.lucene.util.LuceneTestCase;
30 import org.apache.lucene.util._TestUtil;
33 * Base class for all Lucene unit tests that use TokenStreams.
35 * When writing unit tests for analysis components, it's highly recommended
36 * to use the helper methods here (especially in conjunction with {@link MockAnalyzer} or
37 * {@link MockTokenizer}), as they contain many assertions and checks to
43 public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
44 // some helpers to test Analyzers and TokenStreams:
// Test-only attribute: lets the assertion helpers observe whether the stream
// chain called clearAttributes() before producing each token (see the Impl
// class below). getAndResetClearCalled() reads and clears that flag.
// NOTE(review): this is a numbered-listing view with gaps — the interface's
// closing brace (original line ~48) is not visible here.
46 public static interface CheckClearAttributesAttribute extends Attribute {
47 boolean getAndResetClearCalled();
// Implementation of CheckClearAttributesAttribute that records clear() calls.
// NOTE(review): listing gaps hide the bodies of getAndResetClearCalled() and
// the clear() override (original lines ~54-66); from the field and method
// names, clear() presumably sets clearCalled = true and
// getAndResetClearCalled() returns-and-resets it — confirm against the file.
50 public static final class CheckClearAttributesAttributeImpl extends AttributeImpl implements CheckClearAttributesAttribute {
// Flag observed by the assertion helpers; starts false.
51 private boolean clearCalled = false;
53 public boolean getAndResetClearCalled() {
// Two instances are equal iff both are this type and agree on clearCalled.
67 public boolean equals(Object other) {
69 other instanceof CheckClearAttributesAttributeImpl &&
70 ((CheckClearAttributesAttributeImpl) other).clearCalled == this.clearCalled
75 public int hashCode() {
// Arbitrary constant XORed with the boolean's hash, consistent with equals().
76 return 76137213 ^ Boolean.valueOf(clearCalled).hashCode();
// copyTo does not copy state: it invokes clear() on the target, which (via
// the overridden clear() in the hidden lines) marks the target as cleared.
80 public void copyTo(AttributeImpl target) {
81 ((CheckClearAttributesAttributeImpl) target).clear();
// Consumes the whole stream and asserts, token by token, that the produced
// terms — and, where the corresponding expected array is non-null, start/end
// offsets, types and position increments — match. A non-null finalOffset is
// compared against the offset attribute after the stream is exhausted.
// NOTE(review): numbered-listing view with gaps; closing braces of several
// if-blocks, the `if (types != null) {` guard (orig. ~99), and the
// ts.reset()/ts.end()/ts.close() calls fall in the elided lines — do not
// assume they are absent from the real file.
85 public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
86 assertNotNull(output);
// Added so each incrementToken() can be verified to have gone through
// clearAttributes() first.
87 CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
89 assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
90 CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
// Optional attributes are only required/fetched when the caller supplied
// the corresponding expectations.
92 OffsetAttribute offsetAtt = null;
93 if (startOffsets != null || endOffsets != null || finalOffset != null) {
94 assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
95 offsetAtt = ts.getAttribute(OffsetAttribute.class);
98 TypeAttribute typeAtt = null;
100 assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
101 typeAtt = ts.getAttribute(TypeAttribute.class);
104 PositionIncrementAttribute posIncrAtt = null;
105 if (posIncrements != null) {
106 assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
107 posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
111 for (int i = 0; i < output.length; i++) {
112 // extra safety to enforce, that the state is not preserved and also assign bogus values
113 ts.clearAttributes();
114 termAtt.setEmpty().append("bogusTerm");
115 if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243);
116 if (typeAtt != null) typeAtt.setType("bogusType");
117 if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
119 checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
120 assertTrue("token "+i+" does not exist", ts.incrementToken());
// If the flag is still false, some filter in the chain failed to call
// clearAttributes() before setting its token state.
121 assertTrue("clearAttributes() was not called correctly in TokenStream chain", checkClearAtt.getAndResetClearCalled());
123 assertEquals("term "+i, output[i], termAtt.toString());
124 if (startOffsets != null)
125 assertEquals("startOffset "+i, startOffsets[i], offsetAtt.startOffset());
126 if (endOffsets != null)
127 assertEquals("endOffset "+i, endOffsets[i], offsetAtt.endOffset());
129 assertEquals("type "+i, types[i], typeAtt.type());
130 if (posIncrements != null)
131 assertEquals("posIncrement "+i, posIncrements[i], posIncrAtt.getPositionIncrement());
133 // we can enforce some basic things about a few attributes even if the caller doesn't check:
134 if (offsetAtt != null) {
135 assertTrue("startOffset must be >= 0", offsetAtt.startOffset() >= 0);
136 assertTrue("endOffset must be >= 0", offsetAtt.endOffset() >= 0);
137 assertTrue("endOffset must be >= startOffset", offsetAtt.endOffset() >= offsetAtt.startOffset());
139 if (posIncrAtt != null) {
140 assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
// After the expected tokens, the stream must report exhaustion.
143 assertFalse("end of stream", ts.incrementToken());
145 if (finalOffset != null)
146 assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
147 if (offsetAtt != null) {
148 assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
153 public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
154 assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null);
157 public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
158 assertTokenStreamContents(ts, output, null, null, null, null, null);
161 public static void assertTokenStreamContents(TokenStream ts, String[] output, String[] types) throws IOException {
162 assertTokenStreamContents(ts, output, null, null, types, null, null);
165 public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] posIncrements) throws IOException {
166 assertTokenStreamContents(ts, output, null, null, null, posIncrements, null);
169 public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
170 assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null);
173 public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], Integer finalOffset) throws IOException {
174 assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, finalOffset);
177 public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
178 assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null);
181 public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, Integer finalOffset) throws IOException {
182 assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, finalOffset);
185 public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
186 assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
189 public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
190 assertAnalyzesTo(a, input, output, null, null, null, null);
193 public static void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws IOException {
194 assertAnalyzesTo(a, input, output, null, null, types, null);
197 public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
198 assertAnalyzesTo(a, input, output, null, null, null, posIncrements);
201 public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
202 assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null);
205 public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
206 assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements);
210 public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
211 assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
214 public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws IOException {
215 assertAnalyzesToReuse(a, input, output, null, null, null, null);
218 public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, String[] types) throws IOException {
219 assertAnalyzesToReuse(a, input, output, null, null, types, null);
222 public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
223 assertAnalyzesToReuse(a, input, output, null, null, null, posIncrements);
226 public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
227 assertAnalyzesToReuse(a, input, output, startOffsets, endOffsets, null, null);
230 public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
231 assertAnalyzesToReuse(a, input, output, startOffsets, endOffsets, null, posIncrements);
234 // simple utility method for testing stemmers
236 public static void checkOneTerm(Analyzer a, final String input, final String expected) throws IOException {
237 assertAnalyzesTo(a, input, new String[]{expected});
240 public static void checkOneTermReuse(Analyzer a, final String input, final String expected) throws IOException {
241 assertAnalyzesToReuse(a, input, new String[]{expected});
244 // simple utility method for blasting tokenstreams with data to make sure they don't do anything crazy
246 public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
247 checkRandomData(random, a, iterations, 20);
// Blasts the analyzer with `iterations` random strings (simple ASCII,
// realistic Unicode, or arbitrary Unicode, chosen at random per iteration),
// collects the produced terms, then re-analyzes the same text via
// assertAnalyzesToReuse to verify reuse reproduces the first pass.
// NOTE(review): numbered-listing view with gaps — the `text` declaration,
// the switch's case labels/breaks (one branch is entirely elided; nextInt
// spans 0..3 but only three assignments are visible), any ts.reset()/end()/
// close() calls, and the loop/method closing braces fall in the missing lines.
250 public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
251 for (int i = 0; i < iterations; i++) {
253 switch(_TestUtil.nextInt(random, 0, 3)) {
255 text = _TestUtil.randomSimpleString(random);
258 text = _TestUtil.randomRealisticUnicodeString(random, maxWordLength);
261 text = _TestUtil.randomUnicodeString(random, maxWordLength);
264 TokenStream ts = a.reusableTokenStream("dummy", new StringReader(text));
265 assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
266 CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
267 List<String> tokens = new ArrayList<String>();
// First pass: drain the stream and remember every term text.
269 while (ts.incrementToken()) {
270 tokens.add(termAtt.toString());
271 // TODO: we could collect offsets etc here for better checking that reset() really works.
275 // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
276 if (!tokens.isEmpty())
277 assertAnalyzesToReuse(a, text, tokens.toArray(new String[tokens.size()]));