1 package org.apache.lucene.analysis.shingle;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.io.StringReader;
23 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
24 import org.apache.lucene.analysis.Token;
25 import org.apache.lucene.analysis.TokenStream;
26 import org.apache.lucene.analysis.Tokenizer;
27 import org.apache.lucene.analysis.WhitespaceTokenizer;
28 import org.apache.lucene.analysis.tokenattributes.*;
30 public class ShingleFilterTest extends BaseTokenStreamTestCase {
/**
 * Minimal TokenStream that replays a fixed array of pre-built Tokens.
 * Serves as the input side of every shingle test in this class.
 * NOTE(review): the tail of incrementToken() (return statements and closing
 * braces) is missing from this extraction of the file.
 */
32 public class TestTokenStream extends TokenStream {
// Position of the next token to emit.
34 protected int index = 0;
// The fixed script of tokens this stream replays.
35 protected Token[] testToken;
37 private CharTermAttribute termAtt;
38 private OffsetAttribute offsetAtt;
39 private PositionIncrementAttribute posIncrAtt;
40 private TypeAttribute typeAtt;
// Registers the four attributes the shingle tests inspect (term text,
// offsets, position increment, type) and stores the token script.
42 public TestTokenStream(Token[] testToken) {
44 this.testToken = testToken;
45 this.termAtt = addAttribute(CharTermAttribute.class);
46 this.offsetAtt = addAttribute(OffsetAttribute.class);
47 this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
48 this.typeAtt = addAttribute(TypeAttribute.class);
// Copies the next scripted Token's term/offset/posIncr into the stream's
// attributes; type is reset to the default ("word") for every token.
52 public final boolean incrementToken() throws IOException {
54 if (index < testToken.length) {
55 Token t = testToken[index++];
56 termAtt.copyBuffer(t.buffer(), 0, t.length());
57 offsetAtt.setOffset(t.startOffset(), t.endOffset());
58 posIncrAtt.setPositionIncrement(t.getPositionIncrement());
59 typeAtt.setType(TypeAttributeImpl.DEFAULT_TYPE);
// Input fixture: the six-word sentence "please divide this sentence into
// shingles" with realistic character offsets. All expected-output arrays
// below are keyed to this input unless named otherwise.
// NOTE(review): several closing "};" lines and some array contents are
// missing from this extraction (e.g. UNIGRAM_ONLY_POSITION_INCREMENTS).
67 public static final Token[] TEST_TOKEN = new Token[] {
68 createToken("please", 0, 6),
69 createToken("divide", 7, 13),
70 createToken("this", 14, 18),
71 createToken("sentence", 19, 27),
72 createToken("into", 28, 32),
73 createToken("shingles", 33, 39),
76 public static final int[] UNIGRAM_ONLY_POSITION_INCREMENTS = new int[] {
80 public static final String[] UNIGRAM_ONLY_TYPES = new String[] {
81 "word", "word", "word", "word", "word", "word"
// Mutable input fixture rebuilt in setUp(); contains position-increment
// gaps ("holes") simulating tokens removed by an upstream filter.
84 public static Token[] testTokenWithHoles;
// Expected bi-gram output (unigrams included): each word followed by the
// two-word shingle that starts at it; shingles carry posIncr 0.
86 public static final Token[] BI_GRAM_TOKENS = new Token[] {
87 createToken("please", 0, 6),
88 createToken("please divide", 0, 13),
89 createToken("divide", 7, 13),
90 createToken("divide this", 7, 18),
91 createToken("this", 14, 18),
92 createToken("this sentence", 14, 27),
93 createToken("sentence", 19, 27),
94 createToken("sentence into", 19, 32),
95 createToken("into", 28, 32),
96 createToken("into shingles", 28, 39),
97 createToken("shingles", 33, 39),
100 public static final int[] BI_GRAM_POSITION_INCREMENTS = new int[] {
101 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
104 public static final String[] BI_GRAM_TYPES = new String[] {
105 "word", "shingle", "word", "shingle", "word", "shingle", "word",
106 "shingle", "word", "shingle", "word"
// Expected bi-gram output over input with holes; "_" appears to be the
// filler token standing in for a removed position — TODO confirm against
// ShingleFilter's filler-token constant.
109 public static final Token[] BI_GRAM_TOKENS_WITH_HOLES = new Token[] {
110 createToken("please", 0, 6),
111 createToken("please divide", 0, 13),
112 createToken("divide", 7, 13),
113 createToken("divide _", 7, 19),
114 createToken("_ sentence", 19, 27),
115 createToken("sentence", 19, 27),
116 createToken("sentence _", 19, 33),
117 createToken("_ shingles", 33, 39),
118 createToken("shingles", 33, 39),
121 public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES = new int[] {
122 1, 0, 1, 0, 1, 1, 0, 1, 1
125 private static final String[] BI_GRAM_TYPES_WITH_HOLES = {
127 "word", "shingle", "shingle", "word", "shingle", "shingle", "word"
// Expected bi-gram output with outputUnigrams=false: shingles only.
130 public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
131 createToken("please divide", 0, 13),
132 createToken("divide this", 7, 18),
133 createToken("this sentence", 14, 27),
134 createToken("sentence into", 19, 32),
135 createToken("into shingles", 28, 39),
138 public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
142 public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
143 "shingle", "shingle", "shingle", "shingle", "shingle"
146 public static final Token[] BI_GRAM_TOKENS_WITH_HOLES_WITHOUT_UNIGRAMS = new Token[] {
147 createToken("please divide", 0, 13),
148 createToken("divide _", 7, 19),
149 createToken("_ sentence", 19, 27),
150 createToken("sentence _", 19, 33),
151 createToken("_ shingles", 33, 39),
154 public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES_WITHOUT_UNIGRAMS = new int[] {
// Degenerate fixtures: a one-token input (too short to form any bi-gram,
// so only the unigram — or nothing — comes out) and an empty stream.
159 public static final Token[] TEST_SINGLE_TOKEN = new Token[] {
160 createToken("please", 0, 6)
163 public static final Token[] SINGLE_TOKEN = new Token[] {
164 createToken("please", 0, 6)
167 public static final int[] SINGLE_TOKEN_INCREMENTS = new int[] {
171 public static final String[] SINGLE_TOKEN_TYPES = new String[] {
175 public static final Token[] EMPTY_TOKEN_ARRAY = new Token[] {
178 public static final int[] EMPTY_TOKEN_INCREMENTS_ARRAY = new int[] {
181 public static final String[] EMPTY_TOKEN_TYPES_ARRAY = new String[] {
// Expected output for maxShingleSize=3 (unigrams included): at each word
// position, the word, its bi-gram, and its tri-gram, all sharing the
// word's position (posIncr pattern 1,0,0 ... until the tail runs out).
184 public static final Token[] TRI_GRAM_TOKENS = new Token[] {
185 createToken("please", 0, 6),
186 createToken("please divide", 0, 13),
187 createToken("please divide this", 0, 18),
188 createToken("divide", 7, 13),
189 createToken("divide this", 7, 18),
190 createToken("divide this sentence", 7, 27),
191 createToken("this", 14, 18),
192 createToken("this sentence", 14, 27),
193 createToken("this sentence into", 14, 32),
194 createToken("sentence", 19, 27),
195 createToken("sentence into", 19, 32),
196 createToken("sentence into shingles", 19, 39),
197 createToken("into", 28, 32),
198 createToken("into shingles", 28, 39),
199 createToken("shingles", 33, 39)
202 public static final int[] TRI_GRAM_POSITION_INCREMENTS = new int[] {
203 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
206 public static final String[] TRI_GRAM_TYPES = new String[] {
207 "word", "shingle", "shingle",
208 "word", "shingle", "shingle",
209 "word", "shingle", "shingle",
210 "word", "shingle", "shingle",
// Same as above but with outputUnigrams=false: shingles only.
215 public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
216 createToken("please divide", 0, 13),
217 createToken("please divide this", 0, 18),
218 createToken("divide this", 7, 18),
219 createToken("divide this sentence", 7, 27),
220 createToken("this sentence", 14, 27),
221 createToken("this sentence into", 14, 32),
222 createToken("sentence into", 19, 32),
223 createToken("sentence into shingles", 19, 39),
224 createToken("into shingles", 28, 39),
227 public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
228 1, 0, 1, 0, 1, 0, 1, 0, 1
231 public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
232 "shingle", "shingle",
233 "shingle", "shingle",
234 "shingle", "shingle",
235 "shingle", "shingle",
// Expected output for maxShingleSize=4 (unigrams included).
239 public static final Token[] FOUR_GRAM_TOKENS = new Token[] {
240 createToken("please", 0, 6),
241 createToken("please divide", 0, 13),
242 createToken("please divide this", 0, 18),
243 createToken("please divide this sentence", 0, 27),
244 createToken("divide", 7, 13),
245 createToken("divide this", 7, 18),
246 createToken("divide this sentence", 7, 27),
247 createToken("divide this sentence into", 7, 32),
248 createToken("this", 14, 18),
249 createToken("this sentence", 14, 27),
250 createToken("this sentence into", 14, 32),
251 createToken("this sentence into shingles", 14, 39),
252 createToken("sentence", 19, 27),
253 createToken("sentence into", 19, 32),
254 createToken("sentence into shingles", 19, 39),
255 createToken("into", 28, 32),
256 createToken("into shingles", 28, 39),
257 createToken("shingles", 33, 39)
260 public static final int[] FOUR_GRAM_POSITION_INCREMENTS = new int[] {
261 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1
264 public static final String[] FOUR_GRAM_TYPES = new String[] {
265 "word", "shingle", "shingle", "shingle",
266 "word", "shingle", "shingle", "shingle",
267 "word", "shingle", "shingle", "shingle",
268 "word", "shingle", "shingle",
// maxShingleSize=4, outputUnigrams=false.
273 public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
274 createToken("please divide", 0, 13),
275 createToken("please divide this", 0, 18),
276 createToken("please divide this sentence", 0, 27),
277 createToken("divide this", 7, 18),
278 createToken("divide this sentence", 7, 27),
279 createToken("divide this sentence into", 7, 32),
280 createToken("this sentence", 14, 27),
281 createToken("this sentence into", 14, 32),
282 createToken("this sentence into shingles", 14, 39),
283 createToken("sentence into", 19, 32),
284 createToken("sentence into shingles", 19, 39),
285 createToken("into shingles", 28, 39),
288 public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
289 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
292 public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
293 "shingle", "shingle",
294 "shingle", "shingle",
295 "shingle", "shingle",
296 "shingle", "shingle",
297 "shingle", "shingle",
298 "shingle", "shingle",
// Expected output for minShingleSize=3, maxShingleSize=3: bi-grams are
// skipped, so each word is followed only by its tri-gram (when one fits).
302 public static final Token[] TRI_GRAM_TOKENS_MIN_TRI_GRAM = new Token[] {
303 createToken("please", 0, 6),
304 createToken("please divide this", 0, 18),
305 createToken("divide", 7, 13),
306 createToken("divide this sentence", 7, 27),
307 createToken("this", 14, 18),
308 createToken("this sentence into", 14, 32),
309 createToken("sentence", 19, 27),
310 createToken("sentence into shingles", 19, 39),
311 createToken("into", 28, 32),
312 createToken("shingles", 33, 39)
315 public static final int[] TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM = new int[] {
316 1, 0, 1, 0, 1, 0, 1, 0, 1, 1
319 public static final String[] TRI_GRAM_TYPES_MIN_TRI_GRAM = new String[] {
// min=max=3 with outputUnigrams=false: tri-grams only.
328 public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new Token[] {
329 createToken("please divide this", 0, 18),
330 createToken("divide this sentence", 7, 27),
331 createToken("this sentence into", 14, 32),
332 createToken("sentence into shingles", 19, 39)
335 public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new int[] {
339 public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new String[] {
// minShingleSize=3, maxShingleSize=4: tri- and four-grams, no bi-grams.
346 public static final Token[] FOUR_GRAM_TOKENS_MIN_TRI_GRAM = new Token[] {
347 createToken("please", 0, 6),
348 createToken("please divide this", 0, 18),
349 createToken("please divide this sentence", 0, 27),
350 createToken("divide", 7, 13),
351 createToken("divide this sentence", 7, 27),
352 createToken("divide this sentence into", 7, 32),
353 createToken("this", 14, 18),
354 createToken("this sentence into", 14, 32),
355 createToken("this sentence into shingles", 14, 39),
356 createToken("sentence", 19, 27),
357 createToken("sentence into shingles", 19, 39),
358 createToken("into", 28, 32),
359 createToken("shingles", 33, 39)
362 public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM = new int[] {
363 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1
366 public static final String[] FOUR_GRAM_TYPES_MIN_TRI_GRAM = new String[] {
367 "word", "shingle", "shingle",
368 "word", "shingle", "shingle",
369 "word", "shingle", "shingle",
// min=3, max=4, outputUnigrams=false.
375 public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new Token[] {
376 createToken("please divide this", 0, 18),
377 createToken("please divide this sentence", 0, 27),
378 createToken("divide this sentence", 7, 27),
379 createToken("divide this sentence into", 7, 32),
380 createToken("this sentence into", 14, 32),
381 createToken("this sentence into shingles", 14, 39),
382 createToken("sentence into shingles", 19, 39),
385 public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new int[] {
389 public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new String[] {
390 "shingle", "shingle",
391 "shingle", "shingle",
392 "shingle", "shingle",
// min=max=4: only four-grams (plus unigrams in the first variant).
396 public static final Token[] FOUR_GRAM_TOKENS_MIN_FOUR_GRAM = new Token[] {
397 createToken("please", 0, 6),
398 createToken("please divide this sentence", 0, 27),
399 createToken("divide", 7, 13),
400 createToken("divide this sentence into", 7, 32),
401 createToken("this", 14, 18),
402 createToken("this sentence into shingles", 14, 39),
403 createToken("sentence", 19, 27),
404 createToken("into", 28, 32),
405 createToken("shingles", 33, 39)
408 public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM = new int[] {
409 1, 0, 1, 0, 1, 0, 1, 1, 1
412 public static final String[] FOUR_GRAM_TYPES_MIN_FOUR_GRAM = new String[] {
421 public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new Token[] {
422 createToken("please divide this sentence", 0, 27),
423 createToken("divide this sentence into", 7, 32),
424 createToken("this sentence into shingles", 14, 39),
427 public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new int[] {
431 public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new String[] {
// Expected output when the token separator is the empty string "":
// shingle terms are the words concatenated with nothing between them.
437 public static final Token[] BI_GRAM_TOKENS_NO_SEPARATOR = new Token[] {
438 createToken("please", 0, 6),
439 createToken("pleasedivide", 0, 13),
440 createToken("divide", 7, 13),
441 createToken("dividethis", 7, 18),
442 createToken("this", 14, 18),
443 createToken("thissentence", 14, 27),
444 createToken("sentence", 19, 27),
445 createToken("sentenceinto", 19, 32),
446 createToken("into", 28, 32),
447 createToken("intoshingles", 28, 39),
448 createToken("shingles", 33, 39),
451 public static final int[] BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR = new int[] {
452 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
455 public static final String[] BI_GRAM_TYPES_NO_SEPARATOR = new String[] {
456 "word", "shingle", "word", "shingle", "word", "shingle", "word",
457 "shingle", "word", "shingle", "word"
460 public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new Token[] {
461 createToken("pleasedivide", 0, 13),
462 createToken("dividethis", 7, 18),
463 createToken("thissentence", 14, 27),
464 createToken("sentenceinto", 19, 32),
465 createToken("intoshingles", 28, 39),
468 public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new int[] {
472 public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR = new String[] {
473 "shingle", "shingle", "shingle", "shingle", "shingle"
// Same, for maxShingleSize=3 (bi- and tri-grams, no separator).
476 public static final Token[] TRI_GRAM_TOKENS_NO_SEPARATOR = new Token[] {
477 createToken("please", 0, 6),
478 createToken("pleasedivide", 0, 13),
479 createToken("pleasedividethis", 0, 18),
480 createToken("divide", 7, 13),
481 createToken("dividethis", 7, 18),
482 createToken("dividethissentence", 7, 27),
483 createToken("this", 14, 18),
484 createToken("thissentence", 14, 27),
485 createToken("thissentenceinto", 14, 32),
486 createToken("sentence", 19, 27),
487 createToken("sentenceinto", 19, 32),
488 createToken("sentenceintoshingles", 19, 39),
489 createToken("into", 28, 32),
490 createToken("intoshingles", 28, 39),
491 createToken("shingles", 33, 39)
494 public static final int[] TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR = new int[] {
495 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
498 public static final String[] TRI_GRAM_TYPES_NO_SEPARATOR = new String[] {
499 "word", "shingle", "shingle",
500 "word", "shingle", "shingle",
501 "word", "shingle", "shingle",
502 "word", "shingle", "shingle",
507 public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new Token[] {
508 createToken("pleasedivide", 0, 13),
509 createToken("pleasedividethis", 0, 18),
510 createToken("dividethis", 7, 18),
511 createToken("dividethissentence", 7, 27),
512 createToken("thissentence", 14, 27),
513 createToken("thissentenceinto", 14, 32),
514 createToken("sentenceinto", 19, 32),
515 createToken("sentenceintoshingles", 19, 39),
516 createToken("intoshingles", 28, 39),
519 public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new int[] {
520 1, 0, 1, 0, 1, 0, 1, 0, 1
523 public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR = new String[] {
524 "shingle", "shingle",
525 "shingle", "shingle",
526 "shingle", "shingle",
527 "shingle", "shingle",
// Expected output for a custom multi-character separator "<SEP>": the
// literal separator string appears between the words of each shingle.
531 public static final Token[] BI_GRAM_TOKENS_ALT_SEPARATOR = new Token[] {
532 createToken("please", 0, 6),
533 createToken("please<SEP>divide", 0, 13),
534 createToken("divide", 7, 13),
535 createToken("divide<SEP>this", 7, 18),
536 createToken("this", 14, 18),
537 createToken("this<SEP>sentence", 14, 27),
538 createToken("sentence", 19, 27),
539 createToken("sentence<SEP>into", 19, 32),
540 createToken("into", 28, 32),
541 createToken("into<SEP>shingles", 28, 39),
542 createToken("shingles", 33, 39),
545 public static final int[] BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR = new int[] {
546 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
549 public static final String[] BI_GRAM_TYPES_ALT_SEPARATOR = new String[] {
550 "word", "shingle", "word", "shingle", "word", "shingle", "word",
551 "shingle", "word", "shingle", "word"
554 public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new Token[] {
555 createToken("please<SEP>divide", 0, 13),
556 createToken("divide<SEP>this", 7, 18),
557 createToken("this<SEP>sentence", 14, 27),
558 createToken("sentence<SEP>into", 19, 32),
559 createToken("into<SEP>shingles", 28, 39),
562 public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new int[] {
566 public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new String[] {
567 "shingle", "shingle", "shingle", "shingle", "shingle"
570 public static final Token[] TRI_GRAM_TOKENS_ALT_SEPARATOR = new Token[] {
571 createToken("please", 0, 6),
572 createToken("please<SEP>divide", 0, 13),
573 createToken("please<SEP>divide<SEP>this", 0, 18),
574 createToken("divide", 7, 13),
575 createToken("divide<SEP>this", 7, 18),
576 createToken("divide<SEP>this<SEP>sentence", 7, 27),
577 createToken("this", 14, 18),
578 createToken("this<SEP>sentence", 14, 27),
579 createToken("this<SEP>sentence<SEP>into", 14, 32),
580 createToken("sentence", 19, 27),
581 createToken("sentence<SEP>into", 19, 32),
582 createToken("sentence<SEP>into<SEP>shingles", 19, 39),
583 createToken("into", 28, 32),
584 createToken("into<SEP>shingles", 28, 39),
585 createToken("shingles", 33, 39)
588 public static final int[] TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR = new int[] {
589 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
592 public static final String[] TRI_GRAM_TYPES_ALT_SEPARATOR = new String[] {
593 "word", "shingle", "shingle",
594 "word", "shingle", "shingle",
595 "word", "shingle", "shingle",
596 "word", "shingle", "shingle",
601 public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new Token[] {
602 createToken("please<SEP>divide", 0, 13),
603 createToken("please<SEP>divide<SEP>this", 0, 18),
604 createToken("divide<SEP>this", 7, 18),
605 createToken("divide<SEP>this<SEP>sentence", 7, 27),
606 createToken("this<SEP>sentence", 14, 27),
607 createToken("this<SEP>sentence<SEP>into", 14, 32),
608 createToken("sentence<SEP>into", 19, 32),
609 createToken("sentence<SEP>into<SEP>shingles", 19, 39),
610 createToken("into<SEP>shingles", 28, 39),
613 public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new int[] {
614 1, 0, 1, 0, 1, 0, 1, 0, 1
617 public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new String[] {
618 "shingle", "shingle",
619 "shingle", "shingle",
620 "shingle", "shingle",
621 "shingle", "shingle",
// Expected output for a null separator — identical to the no-separator
// (empty string) case, per testTriGramFilterNullSeparator below.
625 public static final Token[] TRI_GRAM_TOKENS_NULL_SEPARATOR = new Token[] {
626 createToken("please", 0, 6),
627 createToken("pleasedivide", 0, 13),
628 createToken("pleasedividethis", 0, 18),
629 createToken("divide", 7, 13),
630 createToken("dividethis", 7, 18),
631 createToken("dividethissentence", 7, 27),
632 createToken("this", 14, 18),
633 createToken("thissentence", 14, 27),
634 createToken("thissentenceinto", 14, 32),
635 createToken("sentence", 19, 27),
636 createToken("sentenceinto", 19, 32),
637 createToken("sentenceintoshingles", 19, 39),
638 createToken("into", 28, 32),
639 createToken("intoshingles", 28, 39),
640 createToken("shingles", 33, 39)
643 public static final int[] TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR = new int[] {
644 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
647 public static final String[] TRI_GRAM_TYPES_NULL_SEPARATOR = new String[] {
648 "word", "shingle", "shingle",
649 "word", "shingle", "shingle",
650 "word", "shingle", "shingle",
651 "word", "shingle", "shingle",
// Input where "sentence" carries position increment 3 — a gap equal to
// the max shingle size (n=3) — exercising filler-token insertion.
656 public static final Token[] TEST_TOKEN_POS_INCR_EQUAL_TO_N = new Token[] {
657 createToken("please", 0, 6),
658 createToken("divide", 7, 13),
659 createToken("this", 14, 18),
660 createToken("sentence", 29, 37, 3),
661 createToken("into", 38, 42),
662 createToken("shingles", 43, 49),
// Expected tri-gram output for the gap-equal-to-n input; "_" appears to
// be the hole filler token — TODO confirm against ShingleFilter.
665 public static final Token[] TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N = new Token[] {
666 createToken("please", 0, 6),
667 createToken("please divide", 0, 13),
668 createToken("please divide this", 0, 18),
669 createToken("divide", 7, 13),
670 createToken("divide this", 7, 18),
671 createToken("divide this _", 7, 29),
672 createToken("this", 14, 18),
673 createToken("this _", 14, 29),
674 createToken("this _ _", 14, 29),
675 createToken("_ _ sentence", 29, 37),
676 createToken("_ sentence", 29, 37),
677 createToken("_ sentence into", 29, 42),
678 createToken("sentence", 29, 37),
679 createToken("sentence into", 29, 42),
680 createToken("sentence into shingles", 29, 49),
681 createToken("into", 38, 42),
682 createToken("into shingles", 38, 49),
683 createToken("shingles", 43, 49)
686 public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N = new int[] {
687 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1
690 public static final String[] TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N = new String[] {
691 "word", "shingle", "shingle",
692 "word", "shingle", "shingle",
693 "word", "shingle", "shingle",
694 "shingle", "shingle", "shingle", "word", "shingle", "shingle",
699 public static final Token[] TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS = new Token[] {
700 createToken("please divide", 0, 13),
701 createToken("please divide this", 0, 18),
702 createToken("divide this", 7, 18),
703 createToken("divide this _", 7, 29),
704 createToken("this _", 14, 29),
705 createToken("this _ _", 14, 29),
706 createToken("_ _ sentence", 29, 37),
707 createToken("_ sentence", 29, 37),
708 createToken("_ sentence into", 29, 42),
709 createToken("sentence into", 29, 42),
710 createToken("sentence into shingles", 29, 49),
711 createToken("into shingles", 38, 49),
714 public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS = new int[] {
715 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1
718 public static final String[] TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS = new String[] {
719 "shingle", "shingle",
720 "shingle", "shingle",
721 "shingle", "shingle",
722 "shingle", "shingle", "shingle",
723 "shingle", "shingle",
// Input where "divide" carries position increment 8 — a gap larger than
// the max shingle size — so the hole can only be partially filled.
727 public static final Token[] TEST_TOKEN_POS_INCR_GREATER_THAN_N = new Token[] {
728 createToken("please", 0, 6),
729 createToken("divide", 57, 63, 8),
730 createToken("this", 64, 68),
731 createToken("sentence", 69, 77),
732 createToken("into", 78, 82),
733 createToken("shingles", 83, 89),
736 public static final Token[] TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N = new Token[] {
737 createToken("please", 0, 6),
738 createToken("please _", 0, 57),
739 createToken("please _ _", 0, 57),
740 createToken("_ _ divide", 57, 63),
741 createToken("_ divide", 57, 63),
742 createToken("_ divide this", 57, 68),
743 createToken("divide", 57, 63),
744 createToken("divide this", 57, 68),
745 createToken("divide this sentence", 57, 77),
746 createToken("this", 64, 68),
747 createToken("this sentence", 64, 77),
748 createToken("this sentence into", 64, 82),
749 createToken("sentence", 69, 77),
750 createToken("sentence into", 69, 82),
751 createToken("sentence into shingles", 69, 89),
752 createToken("into", 78, 82),
753 createToken("into shingles", 78, 89),
754 createToken("shingles", 83, 89)
757 public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N = new int[] {
758 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
760 public static final String[] TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N = new String[] {
761 "word", "shingle", "shingle",
763 "shingle", "shingle",
764 "word", "shingle", "shingle",
765 "word", "shingle", "shingle",
766 "word", "shingle", "shingle",
771 public static final Token[] TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS = new Token[] {
772 createToken("please _", 0, 57),
773 createToken("please _ _", 0, 57),
774 createToken("_ _ divide", 57, 63),
775 createToken("_ divide", 57, 63),
776 createToken("_ divide this", 57, 68),
777 createToken("divide this", 57, 68),
778 createToken("divide this sentence", 57, 77),
779 createToken("this sentence", 64, 77),
780 createToken("this sentence into", 64, 82),
781 createToken("sentence into", 69, 82),
782 createToken("sentence into shingles", 69, 89),
783 createToken("into shingles", 78, 89),
786 public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS = new int[] {
787 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1
790 public static final String[] TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS = new String[] {
791 "shingle", "shingle",
792 "shingle", "shingle",
793 "shingle", "shingle",
794 "shingle", "shingle", "shingle", "shingle", "shingle",
// Rebuilds the mutable testTokenWithHoles fixture before each test so no
// test can observe another's mutations. Position increment 2 on
// "sentence" and "shingles" simulates tokens removed upstream (holes).
// NOTE(review): the super.setUp() call and closing braces are missing
// from this extraction of the file.
799 public void setUp() throws Exception {
801 testTokenWithHoles = new Token[] {
802 createToken("please", 0, 6),
803 createToken("divide", 7, 13),
804 createToken("sentence", 19, 27, 2),
805 createToken("shingles", 33, 39, 2),
// The tests below all delegate to the shingleFilterTest(...) helper
// (defined elsewhere in this file, not visible in this chunk), pairing an
// input fixture with its expected tokens/increments/types; the trailing
// boolean presumably toggles outputUnigrams — TODO confirm against the
// helper's signature. Closing lines of each call are missing from this
// extraction.
810 * Class under test for void ShingleFilter(TokenStream, int)
812 public void testBiGramFilter() throws IOException {
813 this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS,
814 BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
818 public void testBiGramFilterWithHoles() throws IOException {
819 this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES,
820 BI_GRAM_POSITION_INCREMENTS_WITH_HOLES,
821 BI_GRAM_TYPES_WITH_HOLES,
825 public void testBiGramFilterWithoutUnigrams() throws IOException {
826 this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
827 BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
831 public void testBiGramFilterWithHolesWithoutUnigrams() throws IOException {
832 this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES_WITHOUT_UNIGRAMS,
833 BI_GRAM_POSITION_INCREMENTS_WITH_HOLES_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
// Edge cases: single-token and empty input streams.
837 public void testBiGramFilterWithSingleToken() throws IOException {
838 this.shingleFilterTest(2, TEST_SINGLE_TOKEN, SINGLE_TOKEN,
839 SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES,
843 public void testBiGramFilterWithSingleTokenWithoutUnigrams() throws IOException {
844 this.shingleFilterTest(2, TEST_SINGLE_TOKEN, EMPTY_TOKEN_ARRAY,
845 EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
849 public void testBiGramFilterWithEmptyTokenStream() throws IOException {
850 this.shingleFilterTest(2, EMPTY_TOKEN_ARRAY, EMPTY_TOKEN_ARRAY,
851 EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
855 public void testBiGramFilterWithEmptyTokenStreamWithoutUnigrams() throws IOException {
856 this.shingleFilterTest(2, EMPTY_TOKEN_ARRAY, EMPTY_TOKEN_ARRAY,
857 EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
// Larger max shingle sizes.
861 public void testTriGramFilter() throws IOException {
862 this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS,
863 TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES,
867 public void testTriGramFilterWithoutUnigrams() throws IOException {
868 this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
869 TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, TRI_GRAM_TYPES_WITHOUT_UNIGRAMS,
873 public void testFourGramFilter() throws IOException {
874 this.shingleFilterTest(4, TEST_TOKEN, FOUR_GRAM_TOKENS,
875 FOUR_GRAM_POSITION_INCREMENTS, FOUR_GRAM_TYPES,
879 public void testFourGramFilterWithoutUnigrams() throws IOException {
880 this.shingleFilterTest(4, TEST_TOKEN, FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS,
881 FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS,
882 FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS, false);
// Overload taking (minShingleSize, maxShingleSize, ...).
886 public void testTriGramFilterMinTriGram() throws IOException {
887 this.shingleFilterTest(3, 3, TEST_TOKEN, TRI_GRAM_TOKENS_MIN_TRI_GRAM,
888 TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM,
889 TRI_GRAM_TYPES_MIN_TRI_GRAM,
893 public void testTriGramFilterWithoutUnigramsMinTriGram() throws IOException {
894 this.shingleFilterTest(3, 3, TEST_TOKEN,
895 TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
896 TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
897 TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
901 public void testFourGramFilterMinTriGram() throws IOException {
902 this.shingleFilterTest(3, 4, TEST_TOKEN, FOUR_GRAM_TOKENS_MIN_TRI_GRAM,
903 FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM,
904 FOUR_GRAM_TYPES_MIN_TRI_GRAM,
908 public void testFourGramFilterWithoutUnigramsMinTriGram() throws IOException {
909 this.shingleFilterTest(3, 4, TEST_TOKEN,
910 FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
911 FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
912 FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM, false);
915 public void testFourGramFilterMinFourGram() throws IOException {
916 this.shingleFilterTest(4, 4, TEST_TOKEN, FOUR_GRAM_TOKENS_MIN_FOUR_GRAM,
917 FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM,
918 FOUR_GRAM_TYPES_MIN_FOUR_GRAM,
922 public void testFourGramFilterWithoutUnigramsMinFourGram() throws IOException {
923 this.shingleFilterTest(4, 4, TEST_TOKEN,
924 FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM,
925 FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM,
926 FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM, false);
929 public void testBiGramFilterNoSeparator() throws IOException {
930 this.shingleFilterTest("", 2, 2, TEST_TOKEN, BI_GRAM_TOKENS_NO_SEPARATOR,
931 BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR,
932 BI_GRAM_TYPES_NO_SEPARATOR, true);
935 public void testBiGramFilterWithoutUnigramsNoSeparator() throws IOException {
936 this.shingleFilterTest("", 2, 2, TEST_TOKEN,
937 BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
938 BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
939 BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR,
942 public void testTriGramFilterNoSeparator() throws IOException {
943 this.shingleFilterTest("", 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_NO_SEPARATOR,
944 TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR,
945 TRI_GRAM_TYPES_NO_SEPARATOR, true);
948 public void testTriGramFilterWithoutUnigramsNoSeparator() throws IOException {
949 this.shingleFilterTest("", 2, 3, TEST_TOKEN,
950 TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
951 TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
952 TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR, false);
955 public void testBiGramFilterAltSeparator() throws IOException {
956 this.shingleFilterTest("<SEP>", 2, 2, TEST_TOKEN, BI_GRAM_TOKENS_ALT_SEPARATOR,
957 BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR,
958 BI_GRAM_TYPES_ALT_SEPARATOR, true);
961 public void testBiGramFilterWithoutUnigramsAltSeparator() throws IOException {
962 this.shingleFilterTest("<SEP>", 2, 2, TEST_TOKEN,
963 BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
964 BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
965 BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
// Shingles of size 2..3 joined with the custom separator "<SEP>"; unigrams included.
968 public void testTriGramFilterAltSeparator() throws IOException {
969 this.shingleFilterTest("<SEP>", 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_ALT_SEPARATOR,
970 TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR,
971 TRI_GRAM_TYPES_ALT_SEPARATOR, true);
// Shingles of size 2..3 joined with "<SEP>", unigram output suppressed.
974 public void testTriGramFilterWithoutUnigramsAltSeparator() throws IOException {
975 this.shingleFilterTest("<SEP>", 2, 3, TEST_TOKEN,
976 TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
977 TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
978 TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR, false);
// Passes a null token separator to setTokenSeparator(); expected output is
// captured in the *_NULL_SEPARATOR fixtures (presumably equivalent to no
// separator — confirm against ShingleFilter.setTokenSeparator docs).
981 public void testTriGramFilterNullSeparator() throws IOException {
982 this.shingleFilterTest(null, 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_NULL_SEPARATOR,
983 TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR,
984 TRI_GRAM_TYPES_NULL_SEPARATOR, true);
// Input fixture contains a token whose position increment equals the max
// shingle size (3); unigrams included.
987 public void testPositionIncrementEqualToN() throws IOException {
988 this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_EQUAL_TO_N, TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N,
989 TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N,
990 TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N, true);
// Same position-increment-equal-to-N input, but with unigram output suppressed.
993 public void testPositionIncrementEqualToNWithoutUnigrams() throws IOException {
994 this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_EQUAL_TO_N, TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS,
995 TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS,
996 TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS, false);
// Input fixture contains a token whose position increment exceeds the max
// shingle size (3); unigrams included.
1000 public void testPositionIncrementGreaterThanN() throws IOException {
1001 this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_GREATER_THAN_N, TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N,
1002 TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N,
1003 TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N, true);
// Same position-increment-greater-than-N input, with unigram output suppressed.
1006 public void testPositionIncrementGreaterThanNWithoutUnigrams() throws IOException {
1007 this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_GREATER_THAN_N, TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS,
1008 TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS,
1009 TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, false);
// Verifies the tokenizer + ShingleFilter chain is reusable: after consuming
// the whole stream once, resetting the tokenizer with a fresh Reader over
// the same text must reproduce the exact same tokens, offsets, types, and
// position increments (shingles carry posIncr 0, unigrams posIncr 1).
1012 public void testReset() throws Exception {
1013 Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("please divide this sentence"));
1014 TokenStream filter = new ShingleFilter(wsTokenizer, 2);
1015 assertTokenStreamContents(filter,
1016 new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
1017 new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
1018 new String[]{TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE},
1019 new int[]{1,0,1,0,1,0,1}
// Second pass over identical input after reset(Reader) — expectations unchanged.
1021 wsTokenizer.reset(new StringReader("please divide this sentence"));
1022 assertTokenStreamContents(filter,
1023 new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
1024 new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
1025 new String[]{TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE},
1026 new int[]{1,0,1,0,1,0,1}
// Single-token input can never form a 2-gram shingle, so the
// outputUnigramsIfNoShingles option is what makes the lone token appear.
1030 public void testOutputUnigramsIfNoShinglesSingleTokenCase() throws IOException {
1031 // Single token input with outputUnigrams==false is the primary case where
1032 // enabling this option should alter program behavior.
1033 this.shingleFilterTest(2, 2, TEST_SINGLE_TOKEN, SINGLE_TOKEN,
1034 SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES,
// With enough input tokens to form shingles, outputUnigramsIfNoShingles
// must not change the result relative to the plain bigram test.
1038 public void testOutputUnigramsIfNoShinglesWithSimpleBigram() throws IOException {
1039 // Here we expect the same result as with testBiGramFilter().
1040 this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS,
1041 BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
// As above but with outputUnigrams disabled: expectations match the
// unigramless bigram fixtures.
1045 public void testOutputUnigramsIfNoShinglesWithSimpleUnigramlessBigram() throws IOException {
1046 // Here we expect the same result as with testBiGramFilterWithoutUnigrams().
1047 this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
1048 BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
// min shingle size (7) exceeds the input token count, so no shingles can be
// built; the expected output is the unmodified input tokens (unigrams only).
1052 public void testOutputUnigramsIfNoShinglesWithMultipleInputTokens() throws IOException {
1053 // Test when the minimum shingle size is greater than the number of input tokens
1054 this.shingleFilterTest(7, 7, TEST_TOKEN, TEST_TOKEN,
1055 UNIGRAM_ONLY_POSITION_INCREMENTS, UNIGRAM_ONLY_TYPES,
/**
 * Helper: wraps {@code tokensToShingle} in a TestTokenStream, builds a
 * ShingleFilter with the given maximum shingle size (default minimum),
 * applies the outputUnigrams setting, and asserts the stream matches the
 * expected tokens/increments/types via shingleFilterTestCommon.
 */
1059 protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
1060 int[] positionIncrements, String[] types,
1061 boolean outputUnigrams)
1062 throws IOException {
1064 ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
1065 filter.setOutputUnigrams(outputUnigrams);
1066 shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
/**
 * Helper overload taking both minimum and maximum shingle sizes; otherwise
 * identical to the max-size-only overload above.
 */
1069 protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
1070 Token[] tokensToCompare, int[] positionIncrements,
1071 String[] types, boolean outputUnigrams)
1072 throws IOException {
1073 ShingleFilter filter
1074 = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
1075 filter.setOutputUnigrams(outputUnigrams);
1076 shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
/**
 * Helper overload that additionally configures
 * {@code setOutputUnigramsIfNoShingles}, used by the
 * testOutputUnigramsIfNoShingles* tests.
 */
1079 protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
1080 Token[] tokensToCompare, int[] positionIncrements,
1081 String[] types, boolean outputUnigrams,
1082 boolean outputUnigramsIfNoShingles)
1083 throws IOException {
1084 ShingleFilter filter
1085 = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
1086 filter.setOutputUnigrams(outputUnigrams);
1087 filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
1088 shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
/**
 * Helper overload that additionally sets a custom token separator (the
 * string inserted between tokens when composing a shingle); may be empty or
 * null — see the *NoSeparator / *AltSeparator / *NullSeparator tests.
 */
1091 protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle,
1092 Token[] tokensToCompare, int[] positionIncrements,
1093 String[] types, boolean outputUnigrams)
1094 throws IOException {
1095 ShingleFilter filter
1096 = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
1097 filter.setTokenSeparator(tokenSeparator);
1098 filter.setOutputUnigrams(outputUnigrams);
1099 shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
/**
 * Shared assertion tail for all shingleFilterTest overloads: unpacks the
 * expected Token fixtures into parallel arrays of term text, start offsets,
 * and end offsets, then delegates to assertTokenStreamContents to compare
 * the filter's entire output against them (plus the supplied types and
 * position increments).
 */
1102 protected void shingleFilterTestCommon(ShingleFilter filter,
1103 Token[] tokensToCompare,
1104 int[] positionIncrements,
1106 throws IOException {
1107 String text[] = new String[tokensToCompare.length];
1108 int startOffsets[] = new int[tokensToCompare.length];
1109 int endOffsets[] = new int[tokensToCompare.length];
1111 for (int i = 0; i < tokensToCompare.length; i++) {
1112 text[i] = new String(tokensToCompare[i].buffer(),0, tokensToCompare[i].length());
1113 startOffsets[i] = tokensToCompare[i].startOffset();
1114 endOffsets[i] = tokensToCompare[i].endOffset();
1117 assertTokenStreamContents(filter, text, startOffsets, endOffsets, types, positionIncrements);
// Convenience overload: builds a Token with the default position increment of 1.
1120 private static Token createToken(String term, int start, int offset) {
1121 return createToken(term, start, offset, 1);
// Fixture factory: constructs a Token spanning [start, offset), copies the
// term text into its buffer, and applies the given position increment.
// (Method body continues beyond this excerpt.)
1124 private static Token createToken
1125 (String term, int start, int offset, int positionIncrement)
1127 Token token = new Token(start, offset);
1128 token.copyBuffer(term.toCharArray(), 0, term.length());
1129 token.setPositionIncrement(positionIncrement);