lucene-java-3.4.0/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java

   1 package org.apache.lucene.analysis.shingle;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.StringReader;
  22
  23 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
  24 import org.apache.lucene.analysis.Token;
  25 import org.apache.lucene.analysis.TokenStream;
  26 import org.apache.lucene.analysis.Tokenizer;
  27 import org.apache.lucene.analysis.WhitespaceTokenizer;
  28 import org.apache.lucene.analysis.tokenattributes.*;
  29
  30 public class ShingleFilterTest extends BaseTokenStreamTestCase {
  31
  32   public class TestTokenStream extends TokenStream {
  33
  34     protected int index = 0;
  35     protected Token[] testToken;
  36
  37     private CharTermAttribute termAtt;
  38     private OffsetAttribute offsetAtt;
  39     private PositionIncrementAttribute posIncrAtt;
  40     private TypeAttribute typeAtt;
  41
  42     public TestTokenStream(Token[] testToken) {
  43       super();
  44       this.testToken = testToken;
  45       this.termAtt = addAttribute(CharTermAttribute.class);
  46       this.offsetAtt = addAttribute(OffsetAttribute.class);
  47       this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  48       this.typeAtt = addAttribute(TypeAttribute.class);
  49     }
  50
  51     @Override
  52     public final boolean incrementToken() throws IOException {
  53       clearAttributes();
  54       if (index < testToken.length) {
  55         Token t = testToken[index++];
  56         termAtt.copyBuffer(t.buffer(), 0, t.length());
  57         offsetAtt.setOffset(t.startOffset(), t.endOffset());
  58         posIncrAtt.setPositionIncrement(t.getPositionIncrement());
  59         typeAtt.setType(TypeAttributeImpl.DEFAULT_TYPE);
  60         return true;
  61       } else {
  62         return false;
  63       }
  64     }
  65   }
  66
  67   public static final Token[] TEST_TOKEN = new Token[] {
  68       createToken("please", 0, 6),
  69       createToken("divide", 7, 13),
  70       createToken("this", 14, 18),
  71       createToken("sentence", 19, 27),
  72       createToken("into", 28, 32),
  73       createToken("shingles", 33, 39),
  74   };
  75
  76   public static final int[] UNIGRAM_ONLY_POSITION_INCREMENTS = new int[] {
  77     1, 1, 1, 1, 1, 1
  78   };
  79
  80   public static final String[] UNIGRAM_ONLY_TYPES = new String[] {
  81     "word", "word", "word", "word", "word", "word"
  82   };
  83
  84   public static Token[] testTokenWithHoles;
  85
  86   public static final Token[] BI_GRAM_TOKENS = new Token[] {
  87     createToken("please", 0, 6),
  88     createToken("please divide", 0, 13),
  89     createToken("divide", 7, 13),
  90     createToken("divide this", 7, 18),
  91     createToken("this", 14, 18),
  92     createToken("this sentence", 14, 27),
  93     createToken("sentence", 19, 27),
  94     createToken("sentence into", 19, 32),
  95     createToken("into", 28, 32),
  96     createToken("into shingles", 28, 39),
  97     createToken("shingles", 33, 39),
  98   };
  99
 100   public static final int[] BI_GRAM_POSITION_INCREMENTS = new int[] {
 101     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
 102   };
 103
 104   public static final String[] BI_GRAM_TYPES = new String[] {
 105     "word", "shingle", "word", "shingle", "word", "shingle", "word",
 106     "shingle", "word", "shingle", "word"
 107   };
 108
 109   public static final Token[] BI_GRAM_TOKENS_WITH_HOLES = new Token[] {
 110     createToken("please", 0, 6),
 111     createToken("please divide", 0, 13),
 112     createToken("divide", 7, 13),
 113     createToken("divide _", 7, 19),
 114     createToken("_ sentence", 19, 27),
 115     createToken("sentence", 19, 27),
 116     createToken("sentence _", 19, 33),
 117     createToken("_ shingles", 33, 39),
 118     createToken("shingles", 33, 39),
 119   };
 120
 121   public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES = new int[] {
 122     1, 0, 1, 0, 1, 1, 0, 1, 1
 123   };
 124
 125   private static final String[] BI_GRAM_TYPES_WITH_HOLES = {
 126     "word", "shingle",
 127     "word", "shingle", "shingle", "word", "shingle", "shingle", "word"
 128   };
 129
 130   public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
 131     createToken("please divide", 0, 13),
 132     createToken("divide this", 7, 18),
 133     createToken("this sentence", 14, 27),
 134     createToken("sentence into", 19, 32),
 135     createToken("into shingles", 28, 39),
 136   };
 137
 138   public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
 139     1, 1, 1, 1, 1
 140   };
 141
 142   public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
 143     "shingle", "shingle", "shingle", "shingle", "shingle"
 144   };
 145
 146   public static final Token[] BI_GRAM_TOKENS_WITH_HOLES_WITHOUT_UNIGRAMS = new Token[] {
 147     createToken("please divide", 0, 13),
 148     createToken("divide _", 7, 19),
 149     createToken("_ sentence", 19, 27),
 150     createToken("sentence _", 19, 33),
 151     createToken("_ shingles", 33, 39),
 152   };
 153
 154   public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES_WITHOUT_UNIGRAMS = new int[] {
 155     1, 1, 1, 1, 1, 1
 156   };
 157
 158
 159   public static final Token[] TEST_SINGLE_TOKEN = new Token[] {
 160     createToken("please", 0, 6)
 161   };
 162
 163   public static final Token[] SINGLE_TOKEN = new Token[] {
 164     createToken("please", 0, 6)
 165   };
 166
 167   public static final int[] SINGLE_TOKEN_INCREMENTS = new int[] {
 168     1
 169   };
 170
 171   public static final String[] SINGLE_TOKEN_TYPES = new String[] {
 172     "word"
 173   };
 174
 175   public static final Token[] EMPTY_TOKEN_ARRAY = new Token[] {
 176   };
 177
 178   public static final int[] EMPTY_TOKEN_INCREMENTS_ARRAY = new int[] {
 179   };
 180
 181   public static final String[] EMPTY_TOKEN_TYPES_ARRAY = new String[] {
 182   };
 183
 184   public static final Token[] TRI_GRAM_TOKENS = new Token[] {
 185     createToken("please", 0, 6),
 186     createToken("please divide", 0, 13),
 187     createToken("please divide this", 0, 18),
 188     createToken("divide", 7, 13),
 189     createToken("divide this", 7, 18),
 190     createToken("divide this sentence", 7, 27),
 191     createToken("this", 14, 18),
 192     createToken("this sentence", 14, 27),
 193     createToken("this sentence into", 14, 32),
 194     createToken("sentence", 19, 27),
 195     createToken("sentence into", 19, 32),
 196     createToken("sentence into shingles", 19, 39),
 197     createToken("into", 28, 32),
 198     createToken("into shingles", 28, 39),
 199     createToken("shingles", 33, 39)
 200   };
 201
 202   public static final int[] TRI_GRAM_POSITION_INCREMENTS = new int[] {
 203     1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
 204   };
 205
 206   public static final String[] TRI_GRAM_TYPES = new String[] {
 207     "word", "shingle", "shingle",
 208     "word", "shingle", "shingle",
 209     "word", "shingle", "shingle",
 210     "word", "shingle", "shingle",
 211     "word", "shingle",
 212     "word"
 213   };
 214
 215   public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
 216     createToken("please divide", 0, 13),
 217     createToken("please divide this", 0, 18),
 218     createToken("divide this", 7, 18),
 219     createToken("divide this sentence", 7, 27),
 220     createToken("this sentence", 14, 27),
 221     createToken("this sentence into", 14, 32),
 222     createToken("sentence into", 19, 32),
 223     createToken("sentence into shingles", 19, 39),
 224     createToken("into shingles", 28, 39),
 225   };
 226
 227   public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
 228     1, 0, 1, 0, 1, 0, 1, 0, 1
 229   };
 230
 231   public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
 232     "shingle", "shingle",
 233     "shingle", "shingle",
 234     "shingle", "shingle",
 235     "shingle", "shingle",
 236     "shingle",
 237   };
 238
 239   public static final Token[] FOUR_GRAM_TOKENS = new Token[] {
 240     createToken("please", 0, 6),
 241     createToken("please divide", 0, 13),
 242     createToken("please divide this", 0, 18),
 243     createToken("please divide this sentence", 0, 27),
 244     createToken("divide", 7, 13),
 245     createToken("divide this", 7, 18),
 246     createToken("divide this sentence", 7, 27),
 247     createToken("divide this sentence into", 7, 32),
 248     createToken("this", 14, 18),
 249     createToken("this sentence", 14, 27),
 250     createToken("this sentence into", 14, 32),
 251     createToken("this sentence into shingles", 14, 39),
 252     createToken("sentence", 19, 27),
 253     createToken("sentence into", 19, 32),
 254     createToken("sentence into shingles", 19, 39),
 255     createToken("into", 28, 32),
 256     createToken("into shingles", 28, 39),
 257     createToken("shingles", 33, 39)
 258   };
 259
 260   public static final int[] FOUR_GRAM_POSITION_INCREMENTS = new int[] {
 261     1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1
 262   };
 263
 264   public static final String[] FOUR_GRAM_TYPES = new String[] {
 265     "word", "shingle", "shingle", "shingle",
 266     "word", "shingle", "shingle", "shingle",
 267     "word", "shingle", "shingle", "shingle",
 268     "word", "shingle", "shingle",
 269     "word", "shingle",
 270     "word"
 271   };
 272
 273   public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
 274     createToken("please divide", 0, 13),
 275     createToken("please divide this", 0, 18),
 276     createToken("please divide this sentence", 0, 27),
 277     createToken("divide this", 7, 18),
 278     createToken("divide this sentence", 7, 27),
 279     createToken("divide this sentence into", 7, 32),
 280     createToken("this sentence", 14, 27),
 281     createToken("this sentence into", 14, 32),
 282     createToken("this sentence into shingles", 14, 39),
 283     createToken("sentence into", 19, 32),
 284     createToken("sentence into shingles", 19, 39),
 285     createToken("into shingles", 28, 39),
 286   };
 287
 288   public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
 289     1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
 290   };
 291
 292   public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
 293     "shingle", "shingle",
 294     "shingle", "shingle",
 295     "shingle", "shingle",
 296     "shingle", "shingle",
 297     "shingle", "shingle",
 298     "shingle", "shingle",
 299
 300   };
 301
 302   public static final Token[] TRI_GRAM_TOKENS_MIN_TRI_GRAM = new Token[] {
 303     createToken("please", 0, 6),
 304     createToken("please divide this", 0, 18),
 305     createToken("divide", 7, 13),
 306     createToken("divide this sentence", 7, 27),
 307     createToken("this", 14, 18),
 308     createToken("this sentence into", 14, 32),
 309     createToken("sentence", 19, 27),
 310     createToken("sentence into shingles", 19, 39),
 311     createToken("into", 28, 32),
 312     createToken("shingles", 33, 39)
 313   };
 314
 315   public static final int[] TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM = new int[] {
 316     1, 0, 1, 0, 1, 0, 1, 0, 1, 1
 317   };
 318
 319   public static final String[] TRI_GRAM_TYPES_MIN_TRI_GRAM = new String[] {
 320     "word", "shingle",
 321     "word", "shingle",
 322     "word", "shingle",
 323     "word", "shingle",
 324     "word",
 325     "word"
 326   };
 327
 328   public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new Token[] {
 329     createToken("please divide this", 0, 18),
 330     createToken("divide this sentence", 7, 27),
 331     createToken("this sentence into", 14, 32),
 332     createToken("sentence into shingles", 19, 39)
 333   };
 334
 335   public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new int[] {
 336     1, 1, 1, 1
 337   };
 338
 339   public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new String[] {
 340     "shingle",
 341     "shingle",
 342     "shingle",
 343     "shingle"
 344   };
 345
 346   public static final Token[] FOUR_GRAM_TOKENS_MIN_TRI_GRAM = new Token[] {
 347     createToken("please", 0, 6),
 348     createToken("please divide this", 0, 18),
 349     createToken("please divide this sentence", 0, 27),
 350     createToken("divide", 7, 13),
 351     createToken("divide this sentence", 7, 27),
 352     createToken("divide this sentence into", 7, 32),
 353     createToken("this", 14, 18),
 354     createToken("this sentence into", 14, 32),
 355     createToken("this sentence into shingles", 14, 39),
 356     createToken("sentence", 19, 27),
 357     createToken("sentence into shingles", 19, 39),
 358     createToken("into", 28, 32),
 359     createToken("shingles", 33, 39)
 360   };
 361
 362   public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM = new int[] {
 363     1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1
 364   };
 365
 366   public static final String[] FOUR_GRAM_TYPES_MIN_TRI_GRAM = new String[] {
 367     "word", "shingle", "shingle",
 368     "word", "shingle", "shingle",
 369     "word", "shingle", "shingle",
 370     "word", "shingle",
 371     "word",
 372     "word"
 373   };
 374
 375   public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new Token[] {
 376     createToken("please divide this", 0, 18),
 377     createToken("please divide this sentence", 0, 27),
 378     createToken("divide this sentence", 7, 27),
 379     createToken("divide this sentence into", 7, 32),
 380     createToken("this sentence into", 14, 32),
 381     createToken("this sentence into shingles", 14, 39),
 382     createToken("sentence into shingles", 19, 39),
 383   };
 384
 385   public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new int[] {
 386     1, 0, 1, 0, 1, 0, 1
 387   };
 388
 389   public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new String[] {
 390     "shingle", "shingle",
 391     "shingle", "shingle",
 392     "shingle", "shingle",
 393     "shingle"
 394   };
 395
 396   public static final Token[] FOUR_GRAM_TOKENS_MIN_FOUR_GRAM = new Token[] {
 397     createToken("please", 0, 6),
 398     createToken("please divide this sentence", 0, 27),
 399     createToken("divide", 7, 13),
 400     createToken("divide this sentence into", 7, 32),
 401     createToken("this", 14, 18),
 402     createToken("this sentence into shingles", 14, 39),
 403     createToken("sentence", 19, 27),
 404     createToken("into", 28, 32),
 405     createToken("shingles", 33, 39)
 406   };
 407
 408   public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM = new int[] {
 409     1, 0, 1, 0, 1, 0, 1, 1, 1
 410   };
 411
 412   public static final String[] FOUR_GRAM_TYPES_MIN_FOUR_GRAM = new String[] {
 413     "word", "shingle",
 414     "word", "shingle",
 415     "word", "shingle",
 416     "word",
 417     "word",
 418     "word"
 419   };
 420
 421   public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new Token[] {
 422     createToken("please divide this sentence", 0, 27),
 423     createToken("divide this sentence into", 7, 32),
 424     createToken("this sentence into shingles", 14, 39),
 425   };
 426
 427   public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new int[] {
 428     1, 1, 1
 429   };
 430
 431   public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new String[] {
 432     "shingle",
 433     "shingle",
 434     "shingle"
 435   };
 436
 437   public static final Token[] BI_GRAM_TOKENS_NO_SEPARATOR = new Token[] {
 438     createToken("please", 0, 6),
 439     createToken("pleasedivide", 0, 13),
 440     createToken("divide", 7, 13),
 441     createToken("dividethis", 7, 18),
 442     createToken("this", 14, 18),
 443     createToken("thissentence", 14, 27),
 444     createToken("sentence", 19, 27),
 445     createToken("sentenceinto", 19, 32),
 446     createToken("into", 28, 32),
 447     createToken("intoshingles", 28, 39),
 448     createToken("shingles", 33, 39),
 449   };
 450
 451   public static final int[] BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR = new int[] {
 452     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
 453   };
 454
 455   public static final String[] BI_GRAM_TYPES_NO_SEPARATOR = new String[] {
 456     "word", "shingle", "word", "shingle", "word", "shingle", "word",
 457     "shingle", "word", "shingle", "word"
 458   };
 459
 460   public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new Token[] {
 461     createToken("pleasedivide", 0, 13),
 462     createToken("dividethis", 7, 18),
 463     createToken("thissentence", 14, 27),
 464     createToken("sentenceinto", 19, 32),
 465     createToken("intoshingles", 28, 39),
 466   };
 467
 468   public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new int[] {
 469     1, 1, 1, 1, 1
 470   };
 471
 472   public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR = new String[] {
 473     "shingle", "shingle", "shingle", "shingle", "shingle"
 474   };
 475
 476   public static final Token[] TRI_GRAM_TOKENS_NO_SEPARATOR = new Token[] {
 477     createToken("please", 0, 6),
 478     createToken("pleasedivide", 0, 13),
 479     createToken("pleasedividethis", 0, 18),
 480     createToken("divide", 7, 13),
 481     createToken("dividethis", 7, 18),
 482     createToken("dividethissentence", 7, 27),
 483     createToken("this", 14, 18),
 484     createToken("thissentence", 14, 27),
 485     createToken("thissentenceinto", 14, 32),
 486     createToken("sentence", 19, 27),
 487     createToken("sentenceinto", 19, 32),
 488     createToken("sentenceintoshingles", 19, 39),
 489     createToken("into", 28, 32),
 490     createToken("intoshingles", 28, 39),
 491     createToken("shingles", 33, 39)
 492   };
 493
 494   public static final int[] TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR = new int[] {
 495     1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
 496   };
 497
 498   public static final String[] TRI_GRAM_TYPES_NO_SEPARATOR = new String[] {
 499     "word", "shingle", "shingle",
 500     "word", "shingle", "shingle",
 501     "word", "shingle", "shingle",
 502     "word", "shingle", "shingle",
 503     "word", "shingle",
 504     "word"
 505   };
 506
 507   public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new Token[] {
 508     createToken("pleasedivide", 0, 13),
 509     createToken("pleasedividethis", 0, 18),
 510     createToken("dividethis", 7, 18),
 511     createToken("dividethissentence", 7, 27),
 512     createToken("thissentence", 14, 27),
 513     createToken("thissentenceinto", 14, 32),
 514     createToken("sentenceinto", 19, 32),
 515     createToken("sentenceintoshingles", 19, 39),
 516     createToken("intoshingles", 28, 39),
 517   };
 518
 519   public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new int[] {
 520     1, 0, 1, 0, 1, 0, 1, 0, 1
 521   };
 522
 523   public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR = new String[] {
 524     "shingle", "shingle",
 525     "shingle", "shingle",
 526     "shingle", "shingle",
 527     "shingle", "shingle",
 528     "shingle",
 529   };
 530
 531   public static final Token[] BI_GRAM_TOKENS_ALT_SEPARATOR = new Token[] {
 532     createToken("please", 0, 6),
 533     createToken("please<SEP>divide", 0, 13),
 534     createToken("divide", 7, 13),
 535     createToken("divide<SEP>this", 7, 18),
 536     createToken("this", 14, 18),
 537     createToken("this<SEP>sentence", 14, 27),
 538     createToken("sentence", 19, 27),
 539     createToken("sentence<SEP>into", 19, 32),
 540     createToken("into", 28, 32),
 541     createToken("into<SEP>shingles", 28, 39),
 542     createToken("shingles", 33, 39),
 543   };
 544
 545   public static final int[] BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR = new int[] {
 546     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
 547   };
 548
 549   public static final String[] BI_GRAM_TYPES_ALT_SEPARATOR = new String[] {
 550     "word", "shingle", "word", "shingle", "word", "shingle", "word",
 551     "shingle", "word", "shingle", "word"
 552   };
 553
 554   public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new Token[] {
 555     createToken("please<SEP>divide", 0, 13),
 556     createToken("divide<SEP>this", 7, 18),
 557     createToken("this<SEP>sentence", 14, 27),
 558     createToken("sentence<SEP>into", 19, 32),
 559     createToken("into<SEP>shingles", 28, 39),
 560   };
 561
 562   public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new int[] {
 563     1, 1, 1, 1, 1
 564   };
 565
 566   public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new String[] {
 567     "shingle", "shingle", "shingle", "shingle", "shingle"
 568   };
 569
 570   public static final Token[] TRI_GRAM_TOKENS_ALT_SEPARATOR = new Token[] {
 571     createToken("please", 0, 6),
 572     createToken("please<SEP>divide", 0, 13),
 573     createToken("please<SEP>divide<SEP>this", 0, 18),
 574     createToken("divide", 7, 13),
 575     createToken("divide<SEP>this", 7, 18),
 576     createToken("divide<SEP>this<SEP>sentence", 7, 27),
 577     createToken("this", 14, 18),
 578     createToken("this<SEP>sentence", 14, 27),
 579     createToken("this<SEP>sentence<SEP>into", 14, 32),
 580     createToken("sentence", 19, 27),
 581     createToken("sentence<SEP>into", 19, 32),
 582     createToken("sentence<SEP>into<SEP>shingles", 19, 39),
 583     createToken("into", 28, 32),
 584     createToken("into<SEP>shingles", 28, 39),
 585     createToken("shingles", 33, 39)
 586   };
 587
 588   public static final int[] TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR = new int[] {
 589     1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
 590   };
 591
 592   public static final String[] TRI_GRAM_TYPES_ALT_SEPARATOR = new String[] {
 593     "word", "shingle", "shingle",
 594     "word", "shingle", "shingle",
 595     "word", "shingle", "shingle",
 596     "word", "shingle", "shingle",
 597     "word", "shingle",
 598     "word"
 599   };
 600
 601   public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new Token[] {
 602     createToken("please<SEP>divide", 0, 13),
 603     createToken("please<SEP>divide<SEP>this", 0, 18),
 604     createToken("divide<SEP>this", 7, 18),
 605     createToken("divide<SEP>this<SEP>sentence", 7, 27),
 606     createToken("this<SEP>sentence", 14, 27),
 607     createToken("this<SEP>sentence<SEP>into", 14, 32),
 608     createToken("sentence<SEP>into", 19, 32),
 609     createToken("sentence<SEP>into<SEP>shingles", 19, 39),
 610     createToken("into<SEP>shingles", 28, 39),
 611   };
 612
 613   public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new int[] {
 614     1, 0, 1, 0, 1, 0, 1, 0, 1
 615   };
 616
 617   public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new String[] {
 618     "shingle", "shingle",
 619     "shingle", "shingle",
 620     "shingle", "shingle",
 621     "shingle", "shingle",
 622     "shingle",
 623   };
 624
 625   public static final Token[] TRI_GRAM_TOKENS_NULL_SEPARATOR = new Token[] {
 626     createToken("please", 0, 6),
 627     createToken("pleasedivide", 0, 13),
 628     createToken("pleasedividethis", 0, 18),
 629     createToken("divide", 7, 13),
 630     createToken("dividethis", 7, 18),
 631     createToken("dividethissentence", 7, 27),
 632     createToken("this", 14, 18),
 633     createToken("thissentence", 14, 27),
 634     createToken("thissentenceinto", 14, 32),
 635     createToken("sentence", 19, 27),
 636     createToken("sentenceinto", 19, 32),
 637     createToken("sentenceintoshingles", 19, 39),
 638     createToken("into", 28, 32),
 639     createToken("intoshingles", 28, 39),
 640     createToken("shingles", 33, 39)
 641   };
 642
 643   public static final int[] TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR = new int[] {
 644     1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
 645   };
 646
 647   public static final String[] TRI_GRAM_TYPES_NULL_SEPARATOR = new String[] {
 648     "word", "shingle", "shingle",
 649     "word", "shingle", "shingle",
 650     "word", "shingle", "shingle",
 651     "word", "shingle", "shingle",
 652     "word", "shingle",
 653     "word"
 654   };
 655
 656   public static final Token[] TEST_TOKEN_POS_INCR_EQUAL_TO_N = new Token[] {
 657     createToken("please", 0, 6),
 658     createToken("divide", 7, 13),
 659     createToken("this", 14, 18),
 660     createToken("sentence", 29, 37, 3),
 661     createToken("into", 38, 42),
 662     createToken("shingles", 43, 49),
 663   };
 664
 665   public static final Token[] TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N = new Token[] {
 666     createToken("please", 0, 6),
 667     createToken("please divide", 0, 13),
 668     createToken("please divide this", 0, 18),
 669     createToken("divide", 7, 13),
 670     createToken("divide this", 7, 18),
 671     createToken("divide this _", 7, 29),
 672     createToken("this", 14, 18),
 673     createToken("this _", 14, 29),
 674     createToken("this _ _", 14, 29),
 675     createToken("_ _ sentence", 29, 37),
 676     createToken("_ sentence", 29, 37),
 677     createToken("_ sentence into", 29, 42),
 678     createToken("sentence", 29, 37),
 679     createToken("sentence into", 29, 42),
 680     createToken("sentence into shingles", 29, 49),
 681     createToken("into", 38, 42),
 682     createToken("into shingles", 38, 49),
 683     createToken("shingles", 43, 49)
 684   };
 685
 686   public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N = new int[] {
 687     1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1
 688   };
 689
 690   public static final String[] TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N = new String[] {
 691     "word", "shingle", "shingle",
 692     "word", "shingle", "shingle",
 693     "word", "shingle", "shingle",
 694     "shingle", "shingle", "shingle", "word", "shingle", "shingle",
 695     "word", "shingle",
 696     "word"
 697   };
 698
 699   public static final Token[] TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS = new Token[] {
 700     createToken("please divide", 0, 13),
 701     createToken("please divide this", 0, 18),
 702     createToken("divide this", 7, 18),
 703     createToken("divide this _", 7, 29),
 704     createToken("this _", 14, 29),
 705     createToken("this _ _", 14, 29),
 706     createToken("_ _ sentence", 29, 37),
 707     createToken("_ sentence", 29, 37),
 708     createToken("_ sentence into", 29, 42),
 709     createToken("sentence into", 29, 42),
 710     createToken("sentence into shingles", 29, 49),
 711     createToken("into shingles", 38, 49),
 712   };
 713
 714   public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS = new int[] {
 715     1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1
 716   };
 717
 718   public static final String[] TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS = new String[] {
 719     "shingle", "shingle",
 720     "shingle", "shingle",
 721     "shingle", "shingle",
 722     "shingle", "shingle", "shingle",
 723     "shingle", "shingle",
 724     "shingle",
 725   };
 726
 727   public static final Token[] TEST_TOKEN_POS_INCR_GREATER_THAN_N = new Token[] {
 728     createToken("please", 0, 6),
 729     createToken("divide", 57, 63, 8),
 730     createToken("this", 64, 68),
 731     createToken("sentence", 69, 77),
 732     createToken("into", 78, 82),
 733     createToken("shingles", 83, 89),
 734   };
 735
 736   public static final Token[] TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N = new Token[] {
 737     createToken("please", 0, 6),
 738     createToken("please _", 0, 57),
 739     createToken("please _ _", 0, 57),
 740     createToken("_ _ divide", 57, 63),
 741     createToken("_ divide", 57, 63),
 742     createToken("_ divide this", 57, 68),
 743     createToken("divide", 57, 63),
 744     createToken("divide this", 57, 68),
 745     createToken("divide this sentence", 57, 77),
 746     createToken("this", 64, 68),
 747     createToken("this sentence", 64, 77),
 748     createToken("this sentence into", 64, 82),
 749     createToken("sentence", 69, 77),
 750     createToken("sentence into", 69, 82),
 751     createToken("sentence into shingles", 69, 89),
 752     createToken("into", 78, 82),
 753     createToken("into shingles", 78, 89),
 754     createToken("shingles", 83, 89)
 755   };
 756
 757   public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N = new int[] {
 758     1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
 759   };
 760   public static final String[] TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N = new String[] {
 761     "word", "shingle", "shingle",
 762     "shingle",
 763     "shingle", "shingle",
 764     "word", "shingle", "shingle",
 765     "word", "shingle", "shingle",
 766     "word", "shingle", "shingle",
 767     "word", "shingle",
 768     "word"
 769   };
 770
 771   public static final Token[] TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS = new Token[] {
 772     createToken("please _", 0, 57),
 773     createToken("please _ _", 0, 57),
 774     createToken("_ _ divide", 57, 63),
 775     createToken("_ divide", 57, 63),
 776     createToken("_ divide this", 57, 68),
 777     createToken("divide this", 57, 68),
 778     createToken("divide this sentence", 57, 77),
 779     createToken("this sentence", 64, 77),
 780     createToken("this sentence into", 64, 82),
 781     createToken("sentence into", 69, 82),
 782     createToken("sentence into shingles", 69, 89),
 783     createToken("into shingles", 78, 89),
 784   };
 785
 786   public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS = new int[] {
 787     1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1
 788   };
 789
 790   public static final String[] TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS = new String[] {
 791     "shingle", "shingle",
 792     "shingle", "shingle",
 793     "shingle", "shingle",
 794     "shingle", "shingle", "shingle", "shingle", "shingle",
 795     "shingle",
 796   };
 797
 798   @Override
 799   public void setUp() throws Exception {
 800     super.setUp();
 801     testTokenWithHoles = new Token[] {
 802       createToken("please", 0, 6),
 803       createToken("divide", 7, 13),
 804       createToken("sentence", 19, 27, 2),
 805       createToken("shingles", 33, 39, 2),
 806     };
 807   }
 808
 809   /*
 810    * Class under test for void ShingleFilter(TokenStream, int)
 811    */
 812   public void testBiGramFilter() throws IOException {
 813     this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS,
 814                            BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
 815                            true);
 816   }
 817
 818   public void testBiGramFilterWithHoles() throws IOException {
 819     this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES,
 820                            BI_GRAM_POSITION_INCREMENTS_WITH_HOLES,
 821                            BI_GRAM_TYPES_WITH_HOLES,
 822                            true);
 823   }
 824
 825   public void testBiGramFilterWithoutUnigrams() throws IOException {
 826     this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
 827                            BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
 828                            false);
 829   }
 830
 831   public void testBiGramFilterWithHolesWithoutUnigrams() throws IOException {
 832     this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES_WITHOUT_UNIGRAMS,
 833                            BI_GRAM_POSITION_INCREMENTS_WITH_HOLES_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
 834                            false);
 835   }
 836
 837   public void testBiGramFilterWithSingleToken() throws IOException {
 838     this.shingleFilterTest(2, TEST_SINGLE_TOKEN, SINGLE_TOKEN,
 839                            SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES,
 840                            true);
 841   }
 842
 843   public void testBiGramFilterWithSingleTokenWithoutUnigrams() throws IOException {
 844     this.shingleFilterTest(2, TEST_SINGLE_TOKEN, EMPTY_TOKEN_ARRAY,
 845                            EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
 846                            false);
 847   }
 848
 849   public void testBiGramFilterWithEmptyTokenStream() throws IOException {
 850     this.shingleFilterTest(2, EMPTY_TOKEN_ARRAY, EMPTY_TOKEN_ARRAY,
 851                            EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
 852                            true);
 853   }
 854
 855   public void testBiGramFilterWithEmptyTokenStreamWithoutUnigrams() throws IOException {
 856     this.shingleFilterTest(2, EMPTY_TOKEN_ARRAY, EMPTY_TOKEN_ARRAY,
 857                            EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
 858                            false);
 859   }
 860
 861   public void testTriGramFilter() throws IOException {
 862     this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS,
 863                            TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES,
 864                            true);
 865   }
 866
 867   public void testTriGramFilterWithoutUnigrams() throws IOException {
 868     this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
 869                            TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, TRI_GRAM_TYPES_WITHOUT_UNIGRAMS,
 870                            false);
 871   }
 872
 873   public void testFourGramFilter() throws IOException {
 874     this.shingleFilterTest(4, TEST_TOKEN, FOUR_GRAM_TOKENS,
 875         FOUR_GRAM_POSITION_INCREMENTS, FOUR_GRAM_TYPES,
 876                            true);
 877   }
 878
 879   public void testFourGramFilterWithoutUnigrams() throws IOException {
 880     this.shingleFilterTest(4, TEST_TOKEN, FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS,
 881         FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS,
 882         FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS, false);
 883   }
 884
 885
 886   public void testTriGramFilterMinTriGram() throws IOException {
 887     this.shingleFilterTest(3, 3, TEST_TOKEN, TRI_GRAM_TOKENS_MIN_TRI_GRAM,
 888                            TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM,
 889                            TRI_GRAM_TYPES_MIN_TRI_GRAM,
 890                            true);
 891   }
 892
 893   public void testTriGramFilterWithoutUnigramsMinTriGram() throws IOException {
 894     this.shingleFilterTest(3, 3, TEST_TOKEN,
 895                            TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
 896                            TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
 897                            TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
 898                            false);
 899   }
 900
 901   public void testFourGramFilterMinTriGram() throws IOException {
 902     this.shingleFilterTest(3, 4, TEST_TOKEN, FOUR_GRAM_TOKENS_MIN_TRI_GRAM,
 903                            FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM,
 904                            FOUR_GRAM_TYPES_MIN_TRI_GRAM,
 905                            true);
 906   }
 907
 908   public void testFourGramFilterWithoutUnigramsMinTriGram() throws IOException {
 909     this.shingleFilterTest(3, 4, TEST_TOKEN,
 910                            FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
 911                            FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
 912                            FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM, false);
 913   }
 914
 915   public void testFourGramFilterMinFourGram() throws IOException {
 916     this.shingleFilterTest(4, 4, TEST_TOKEN, FOUR_GRAM_TOKENS_MIN_FOUR_GRAM,
 917                            FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM,
 918                            FOUR_GRAM_TYPES_MIN_FOUR_GRAM,
 919                            true);
 920   }
 921
 922   public void testFourGramFilterWithoutUnigramsMinFourGram() throws IOException {
 923     this.shingleFilterTest(4, 4, TEST_TOKEN,
 924                            FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM,
 925                            FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM,
 926                            FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM, false);
 927   }
 928
 929   public void testBiGramFilterNoSeparator() throws IOException {
 930     this.shingleFilterTest("", 2, 2, TEST_TOKEN, BI_GRAM_TOKENS_NO_SEPARATOR,
 931                            BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR,
 932                            BI_GRAM_TYPES_NO_SEPARATOR, true);
 933   }
 934
 935   public void testBiGramFilterWithoutUnigramsNoSeparator() throws IOException {
 936     this.shingleFilterTest("", 2, 2, TEST_TOKEN,
 937                            BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
 938                            BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
 939                            BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR,
 940                            false);
 941   }
 942   public void testTriGramFilterNoSeparator() throws IOException {
 943     this.shingleFilterTest("", 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_NO_SEPARATOR,
 944                            TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR,
 945                            TRI_GRAM_TYPES_NO_SEPARATOR, true);
 946   }
 947
 948   public void testTriGramFilterWithoutUnigramsNoSeparator() throws IOException {
 949     this.shingleFilterTest("", 2, 3, TEST_TOKEN,
 950                            TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
 951                            TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
 952                            TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR, false);
 953   }
 954
 955   public void testBiGramFilterAltSeparator() throws IOException {
 956     this.shingleFilterTest("<SEP>", 2, 2, TEST_TOKEN, BI_GRAM_TOKENS_ALT_SEPARATOR,
 957                            BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR,
 958                            BI_GRAM_TYPES_ALT_SEPARATOR, true);
 959   }
 960
 961   public void testBiGramFilterWithoutUnigramsAltSeparator() throws IOException {
 962     this.shingleFilterTest("<SEP>", 2, 2, TEST_TOKEN,
 963                            BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
 964                            BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
 965                            BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
 966                            false);
 967   }
 968   public void testTriGramFilterAltSeparator() throws IOException {
 969     this.shingleFilterTest("<SEP>", 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_ALT_SEPARATOR,
 970                            TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR,
 971                            TRI_GRAM_TYPES_ALT_SEPARATOR, true);
 972   }
 973
 974   public void testTriGramFilterWithoutUnigramsAltSeparator() throws IOException {
 975     this.shingleFilterTest("<SEP>", 2, 3, TEST_TOKEN,
 976                            TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
 977                            TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
 978                            TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR, false);
 979   }
 980
 981   public void testTriGramFilterNullSeparator() throws IOException {
 982     this.shingleFilterTest(null, 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_NULL_SEPARATOR,
 983                            TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR,
 984                            TRI_GRAM_TYPES_NULL_SEPARATOR, true);
 985   }
 986
 987   public void testPositionIncrementEqualToN() throws IOException {
 988     this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_EQUAL_TO_N, TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N,
 989                            TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N,
 990                            TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N, true);
 991   }
 992
 993   public void testPositionIncrementEqualToNWithoutUnigrams() throws IOException {
 994     this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_EQUAL_TO_N, TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS,
 995                            TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS,
 996                            TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS, false);
 997   }
 998
 999
1000   public void testPositionIncrementGreaterThanN() throws IOException {
1001     this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_GREATER_THAN_N, TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N,
1002                            TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N,
1003                            TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N, true);
1004   }
1005
1006   public void testPositionIncrementGreaterThanNWithoutUnigrams() throws IOException {
1007     this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_GREATER_THAN_N, TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS,
1008                            TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS,
1009                            TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, false);
1010   }
1011
1012   public void testReset() throws Exception {
1013     Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("please divide this sentence"));
1014     TokenStream filter = new ShingleFilter(wsTokenizer, 2);
1015     assertTokenStreamContents(filter,
1016       new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
1017       new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
1018       new String[]{TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE},
1019       new int[]{1,0,1,0,1,0,1}
1020     );
1021     wsTokenizer.reset(new StringReader("please divide this sentence"));
1022     assertTokenStreamContents(filter,
1023       new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
1024       new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
1025       new String[]{TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE},
1026       new int[]{1,0,1,0,1,0,1}
1027     );
1028   }
1029
1030   public void testOutputUnigramsIfNoShinglesSingleTokenCase() throws IOException {
1031     // Single token input with outputUnigrams==false is the primary case where
1032     // enabling this option should alter program behavior.
1033     this.shingleFilterTest(2, 2, TEST_SINGLE_TOKEN, SINGLE_TOKEN,
1034                            SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES,
1035                            false, true);
1036   }
1037
1038   public void testOutputUnigramsIfNoShinglesWithSimpleBigram() throws IOException {
1039     // Here we expect the same result as with testBiGramFilter().
1040     this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS,
1041                            BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
1042                            true, true);
1043   }
1044
1045   public void testOutputUnigramsIfNoShinglesWithSimpleUnigramlessBigram() throws IOException {
1046     // Here we expect the same result as with testBiGramFilterWithoutUnigrams().
1047     this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
1048                            BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
1049                            false, true);
1050   }
1051
1052   public void testOutputUnigramsIfNoShinglesWithMultipleInputTokens() throws IOException {
1053     // Test when the minimum shingle size is greater than the number of input tokens
1054     this.shingleFilterTest(7, 7, TEST_TOKEN, TEST_TOKEN,
1055                            UNIGRAM_ONLY_POSITION_INCREMENTS, UNIGRAM_ONLY_TYPES,
1056                            false, true);
1057   }
1058
1059   protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
1060                                    int[] positionIncrements, String[] types,
1061                                    boolean outputUnigrams)
1062     throws IOException {
1063
1064     ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
1065     filter.setOutputUnigrams(outputUnigrams);
1066     shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
1067   }
1068
1069   protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
1070                                    Token[] tokensToCompare, int[] positionIncrements,
1071                                    String[] types, boolean outputUnigrams)
1072     throws IOException {
1073     ShingleFilter filter
1074       = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
1075     filter.setOutputUnigrams(outputUnigrams);
1076     shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
1077   }
1078
1079   protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
1080                                    Token[] tokensToCompare, int[] positionIncrements,
1081                                    String[] types, boolean outputUnigrams,
1082                                    boolean outputUnigramsIfNoShingles)
1083     throws IOException {
1084     ShingleFilter filter
1085       = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
1086     filter.setOutputUnigrams(outputUnigrams);
1087     filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
1088     shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
1089   }
1090
1091   protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle,
1092                                    Token[] tokensToCompare, int[] positionIncrements,
1093                                    String[] types, boolean outputUnigrams)
1094     throws IOException {
1095     ShingleFilter filter
1096       = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
1097     filter.setTokenSeparator(tokenSeparator);
1098     filter.setOutputUnigrams(outputUnigrams);
1099     shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
1100   }
1101
1102   protected void shingleFilterTestCommon(ShingleFilter filter,
1103                                          Token[] tokensToCompare,
1104                                          int[] positionIncrements,
1105                                          String[] types)
1106     throws IOException {
1107     String text[] = new String[tokensToCompare.length];
1108     int startOffsets[] = new int[tokensToCompare.length];
1109     int endOffsets[] = new int[tokensToCompare.length];
1110
1111     for (int i = 0; i < tokensToCompare.length; i++) {
1112       text[i] = new String(tokensToCompare[i].buffer(),0, tokensToCompare[i].length());
1113       startOffsets[i] = tokensToCompare[i].startOffset();
1114       endOffsets[i] = tokensToCompare[i].endOffset();
1115     }
1116
1117     assertTokenStreamContents(filter, text, startOffsets, endOffsets, types, positionIncrements);
1118   }
1119
1120   private static Token createToken(String term, int start, int offset) {
1121     return createToken(term, start, offset, 1);
1122   }
1123
1124   private static Token createToken
1125     (String term, int start, int offset, int positionIncrement)
1126   {
1127     Token token = new Token(start, offset);
1128     token.copyBuffer(term.toCharArray(), 0, term.length());
1129     token.setPositionIncrement(positionIncrement);
1130     return token;
1131   }
1132 }