1 package org.apache.lucene.analysis;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.StringReader;
21 import java.io.IOException;
22 import java.util.ArrayList;
23 import java.util.List;
24 import java.util.Random;
26 import org.apache.lucene.analysis.tokenattributes.*;
27 import org.apache.lucene.util.Attribute;
28 import org.apache.lucene.util.AttributeImpl;
29 import org.apache.lucene.util.LuceneTestCase;
30 import org.apache.lucene.util._TestUtil;
33 * Base class for all Lucene unit tests that use TokenStreams.
35 * When writing unit tests for analysis components, it is highly recommended
36 * to use the helper methods here (especially in conjunction with {@link MockAnalyzer} or
37 * {@link MockTokenizer}), as they contain many assertions and checks to
43 public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
44 // some helpers to test Analyzers and TokenStreams:
46 public static interface CheckClearAttributesAttribute extends Attribute {
47 boolean getAndResetClearCalled();
50 public static final class CheckClearAttributesAttributeImpl extends AttributeImpl implements CheckClearAttributesAttribute {
51 private boolean clearCalled = false;
53 public boolean getAndResetClearCalled() {
67 public boolean equals(Object other) {
69 other instanceof CheckClearAttributesAttributeImpl &&
70 ((CheckClearAttributesAttributeImpl) other).clearCalled == this.clearCalled
75 public int hashCode() {
76 return 76137213 ^ Boolean.valueOf(clearCalled).hashCode();
80 public void copyTo(AttributeImpl target) {
81 ((CheckClearAttributesAttributeImpl) target).clear();
85 public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
86 assertNotNull(output);
87 CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
89 assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
90 CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
92 OffsetAttribute offsetAtt = null;
93 if (startOffsets != null || endOffsets != null || finalOffset != null) {
94 assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
95 offsetAtt = ts.getAttribute(OffsetAttribute.class);
98 TypeAttribute typeAtt = null;
100 assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
101 typeAtt = ts.getAttribute(TypeAttribute.class);
104 PositionIncrementAttribute posIncrAtt = null;
105 if (posIncrements != null) {
106 assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
107 posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
111 for (int i = 0; i < output.length; i++) {
112 // extra safety to enforce, that the state is not preserved and also assign bogus values
113 ts.clearAttributes();
114 termAtt.setEmpty().append("bogusTerm");
115 if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243);
116 if (typeAtt != null) typeAtt.setType("bogusType");
117 if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
119 checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
120 assertTrue("token "+i+" does not exist", ts.incrementToken());
121 assertTrue("clearAttributes() was not called correctly in TokenStream chain", checkClearAtt.getAndResetClearCalled());
123 assertEquals("term "+i, output[i], termAtt.toString());
124 if (startOffsets != null)
125 assertEquals("startOffset "+i, startOffsets[i], offsetAtt.startOffset());
126 if (endOffsets != null)
127 assertEquals("endOffset "+i, endOffsets[i], offsetAtt.endOffset());
129 assertEquals("type "+i, types[i], typeAtt.type());
130 if (posIncrements != null)
131 assertEquals("posIncrement "+i, posIncrements[i], posIncrAtt.getPositionIncrement());
133 // we can enforce some basic things about a few attributes even if the caller doesn't check:
134 if (offsetAtt != null) {
135 assertTrue("startOffset must be >= 0", offsetAtt.startOffset() >= 0);
136 assertTrue("endOffset must be >= 0", offsetAtt.endOffset() >= 0);
137 assertTrue("endOffset must be >= startOffset", offsetAtt.endOffset() >= offsetAtt.startOffset());
139 if (posIncrAtt != null) {
140 assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
143 assertFalse("end of stream", ts.incrementToken());
145 if (finalOffset != null)
146 assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
147 if (offsetAtt != null) {
148 assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
153 public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
154 assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null);
157 public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
158 assertTokenStreamContents(ts, output, null, null, null, null, null);
161 public static void assertTokenStreamContents(TokenStream ts, String[] output, String[] types) throws IOException {
162 assertTokenStreamContents(ts, output, null, null, types, null, null);
165 public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] posIncrements) throws IOException {
166 assertTokenStreamContents(ts, output, null, null, null, posIncrements, null);
169 public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
170 assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null);
173 public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], Integer finalOffset) throws IOException {
174 assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, finalOffset);
177 public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
178 assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null);
181 public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, Integer finalOffset) throws IOException {
182 assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, finalOffset);
185 public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
186 assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
189 public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
190 assertAnalyzesTo(a, input, output, null, null, null, null);
193 public static void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws IOException {
194 assertAnalyzesTo(a, input, output, null, null, types, null);
197 public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
198 assertAnalyzesTo(a, input, output, null, null, null, posIncrements);
201 public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
202 assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null);
205 public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
206 assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements);
210 public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
211 assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
214 public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws IOException {
215 assertAnalyzesToReuse(a, input, output, null, null, null, null);
218 public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, String[] types) throws IOException {
219 assertAnalyzesToReuse(a, input, output, null, null, types, null);
222 public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
223 assertAnalyzesToReuse(a, input, output, null, null, null, posIncrements);
226 public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
227 assertAnalyzesToReuse(a, input, output, startOffsets, endOffsets, null, null);
230 public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
231 assertAnalyzesToReuse(a, input, output, startOffsets, endOffsets, null, posIncrements);
234 // simple utility method for testing stemmers
236 public static void checkOneTerm(Analyzer a, final String input, final String expected) throws IOException {
237 assertAnalyzesTo(a, input, new String[]{expected});
240 public static void checkOneTermReuse(Analyzer a, final String input, final String expected) throws IOException {
241 assertAnalyzesToReuse(a, input, new String[]{expected});
244 // simple utility method for blasting tokenstreams with data to make sure they don't do anything crazy
246 public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
247 checkRandomData(random, a, iterations, 20);
250 public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
251 for (int i = 0; i < iterations; i++) {
253 switch(_TestUtil.nextInt(random, 0, 3)) {
255 text = _TestUtil.randomSimpleString(random);
258 text = _TestUtil.randomRealisticUnicodeString(random, maxWordLength);
261 text = _TestUtil.randomUnicodeString(random, maxWordLength);
265 System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
268 TokenStream ts = a.reusableTokenStream("dummy", new StringReader(text));
269 assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
270 CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
271 OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
272 PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
273 TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
274 List<String> tokens = new ArrayList<String>();
275 List<String> types = new ArrayList<String>();
276 List<Integer> positions = new ArrayList<Integer>();
277 List<Integer> startOffsets = new ArrayList<Integer>();
278 List<Integer> endOffsets = new ArrayList<Integer>();
280 while (ts.incrementToken()) {
281 tokens.add(termAtt.toString());
282 if (typeAtt != null) types.add(typeAtt.type());
283 if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
284 if (offsetAtt != null) {
285 startOffsets.add(offsetAtt.startOffset());
286 endOffsets.add(offsetAtt.endOffset());
291 // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
292 if (!tokens.isEmpty()) {
294 System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
296 if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
297 // offset + pos + type
298 assertAnalyzesToReuse(a, text,
299 tokens.toArray(new String[tokens.size()]),
300 toIntArray(startOffsets),
301 toIntArray(endOffsets),
302 types.toArray(new String[types.size()]),
303 toIntArray(positions));
304 } else if (posIncAtt != null && offsetAtt != null) {
306 assertAnalyzesToReuse(a, text,
307 tokens.toArray(new String[tokens.size()]),
308 toIntArray(startOffsets),
309 toIntArray(endOffsets),
310 toIntArray(positions));
311 } else if (offsetAtt != null) {
313 assertAnalyzesToReuse(a, text,
314 tokens.toArray(new String[tokens.size()]),
315 toIntArray(startOffsets),
316 toIntArray(endOffsets));
319 assertAnalyzesToReuse(a, text,
320 tokens.toArray(new String[tokens.size()]));
326 static int[] toIntArray(List<Integer> list) {
327 int ret[] = new int[list.size()];
329 for (Integer i : list) {