1 package org.apache.lucene.analysis.ngram;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import org.apache.lucene.analysis.Tokenizer;
21 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
22 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
23 import org.apache.lucene.util.AttributeSource;
25 import java.io.IOException;
26 import java.io.Reader;
29 * Tokenizes the input into n-grams of the given size(s).
// Tokenizer that emits n-grams of its input, with gram sizes bounded by minGram and maxGram.
31 public final class NGramTokenizer extends Tokenizer {
// Gram-size bounds used by the single-argument constructor.
32 public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
33 public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
// Smallest and largest gram size; validated and assigned in init().
35 private int minGram, maxGram;
// NOTE(review): presumably records whether the input has already been consumed by the first
// incrementToken() call; the reads/writes of this flag are not visible in this listing - confirm.
40 private boolean started = false;
// Attributes registered on this stream: incrementToken() writes the gram text to termAtt
// and the gram's start/end offsets to offsetAtt.
42 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
43 private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
46 * Creates NGramTokenizer with given min and max n-grams.
47 * @param input {@link Reader} holding the input to be tokenized
48 * @param minGram the smallest n-gram to generate
49 * @param maxGram the largest n-gram to generate
// Reader-based constructor with explicit gram-size bounds.
51 public NGramTokenizer(Reader input, int minGram, int maxGram) {
// init() rejects an invalid range with IllegalArgumentException before storing it.
// NOTE(review): the superclass constructor call is not visible in this listing.
53 init(minGram, maxGram);
57 * Creates NGramTokenizer with given min and max n-grams.
58 * @param source {@link AttributeSource} to use
59 * @param input {@link Reader} holding the input to be tokenized
60 * @param minGram the smallest n-gram to generate
61 * @param maxGram the largest n-gram to generate
// Constructor taking an existing AttributeSource plus explicit gram-size bounds.
63 public NGramTokenizer(AttributeSource source, Reader input, int minGram, int maxGram) {
// init() rejects an invalid range with IllegalArgumentException before storing it.
// NOTE(review): the superclass constructor call is not visible in this listing.
65 init(minGram, maxGram);
69 * Creates NGramTokenizer with given min and max n-grams.
70 * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
71 * @param input {@link Reader} holding the input to be tokenized
72 * @param minGram the smallest n-gram to generate
73 * @param maxGram the largest n-gram to generate
// Constructor taking an AttributeFactory plus explicit gram-size bounds.
75 public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) {
// Hands the factory and reader to the Tokenizer superclass.
76 super(factory, input);
// init() rejects an invalid range with IllegalArgumentException before storing it.
77 init(minGram, maxGram);
81 * Creates NGramTokenizer with default min and max n-grams.
82 * @param input {@link Reader} holding the input to be tokenized
// Convenience constructor: uses the default gram-size range.
84 public NGramTokenizer(Reader input) {
// Chains to the (Reader, int, int) constructor with DEFAULT_MIN_NGRAM_SIZE (1) and DEFAULT_MAX_NGRAM_SIZE (2).
85 this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
// Validates the requested gram-size range and stores it in the minGram/maxGram fields.
// Throws IllegalArgumentException when the range is invalid.
88 private void init(int minGram, int maxGram) {
// NOTE(review): the condition guarding this throw is not visible in this listing; the message
// implies it fires when minGram is less than 1 - confirm.
90 throw new IllegalArgumentException("minGram must be greater than zero");
// Equal bounds are accepted: only minGram strictly greater than maxGram is rejected.
92 if (minGram > maxGram) {
93 throw new IllegalArgumentException("minGram must not be greater than maxGram");
// Range accepted: copy the parameters into the like-named fields.
95 this.minGram = minGram;
96 this.maxGram = maxGram;
99 /** Advances the stream to the next n-gram token. The return type is boolean, so end of stream cannot be signalled with null; presumably false marks EOS (TODO confirm: the return statements are not visible in this listing). */
101 public final boolean incrementToken() throws IOException {
// Fixed 1024-char scratch buffer; the code that fills it from the input is not visible here.
// NOTE(review): input longer than the buffer is presumably never tokenized - confirm.
106 char[] chars = new char[1024];
// new String(chars) spans the whole buffer; trim() strips every leading and trailing char
// <= U+0020, which covers unfilled '\0' slots as well as real whitespace.
// NOTE(review): because leading whitespace is stripped too, positions in inStr may be shifted
// relative to the raw input, which would skew the offsets reported below - verify.
108 inStr = new String(chars).trim(); // remove any trailing empty strings
109 inLen = inStr.length();
// A gram of the current size starting at pos would run past the end of inStr:
// restart from the beginning of the string with the next larger gram size.
112 if (pos+gramSize > inLen) { // if we hit the end of the string
113 pos = 0; // reset to beginning of string
114 gramSize++; // increase n-gram size
// The statements controlled by the next two ifs are not visible in this listing.
115 if (gramSize > maxGram) // we are done
// pos was just reset to 0, so this checks whether inStr is shorter than the new gram size.
117 if (pos+gramSize > inLen)
// Emit the gram inStr[oldPos, oldPos+gramSize). oldPos is declared on a line not visible here
// (NOTE(review): presumably the value of pos before it is advanced - confirm).
123 termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize);
// Start/end offsets are the gram's bounds within inStr, each passed through correctOffset().
124 offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
// End-of-stream hook: sets both the start and end offset to the final offset.
129 public final void end() {
// inLen is the length of the trimmed input string computed in incrementToken().
// NOTE(review): unlike the offsets set in incrementToken(), this value is not passed through
// correctOffset(), and it excludes any whitespace removed by trim() - confirm this is intended.
131 final int finalOffset = inLen;
132 this.offsetAtt.setOffset(finalOffset, finalOffset);
136 public void reset(Reader input) throws IOException {
142 public void reset() throws IOException {