1 package org.apache.lucene.analysis.ngram;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import org.apache.lucene.analysis.TokenFilter;
21 import org.apache.lucene.analysis.TokenStream;
22 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
23 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
25 import java.io.IOException;
28 * Tokenizes the given token into n-grams of given size(s).
30 * This {@link TokenFilter} creates n-grams from the beginning edge or ending edge of an input token.
33 public final class EdgeNGramTokenFilter extends TokenFilter {
// Default configuration values. None of them is read in the visible part of this
// file, so they are presumably meant for callers that need defaults — TODO confirm.
34 public static final Side DEFAULT_SIDE = Side.FRONT;
35 public static final int DEFAULT_MAX_GRAM_SIZE = 1;
36 public static final int DEFAULT_MIN_GRAM_SIZE = 1;
38 /** Specifies which side of the input the n-gram should be generated from */
// NOTE(review): lines are missing from this excerpt (the embedded line numbers are not
// contiguous). The declarations of the constants, the return statements of getSide and
// the closing braces are not shown.
39 public static enum Side {
41 /** Get the n-gram from the front of the input */
// Label implementation returning "front"; presumably the body of the FRONT constant
// referenced in getSide below — TODO confirm.
44 public String getLabel() { return "front"; }
47 /** Get the n-gram from the end of the input */
// Label implementation returning "back"; presumably the body of the BACK constant
// referenced in getSide below — TODO confirm.
50 public String getLabel() { return "back"; }
// Textual name of this side; getSide matches its argument against these labels.
53 public abstract String getLabel();
55 // Get the appropriate Side from a string
// The FRONT label is compared first, then the BACK label. equals() is invoked on the
// label rather than on sideName, so a null sideName does not throw at these checks.
// What is returned when neither label matches is not visible here — TODO confirm.
56 public static Side getSide(String sideName) {
57 if (FRONT.getLabel().equals(sideName)) {
60 if (BACK.getLabel().equals(sideName)) {
// Inclusive bounds on the n-gram sizes to emit: incrementToken() starts each term at
// minGram and only emits while the current size is <= maxGram.
67 private final int minGram;
68 private final int maxGram;
// Per-term working state used by incrementToken():
//   curTermBuffer - copy of the current input term's characters; null means no term is buffered
//   curTermLength - length of the current input term as reported by the term attribute
//   curGramSize   - size of the n-gram currently being considered, initialised to minGram per term
// NOTE(review): incrementToken() also uses `side` and `tokStart`; their declarations are
// not visible in this excerpt.
70 private char[] curTermBuffer;
71 private int curTermLength;
72 private int curGramSize;
// Attribute handles obtained through addAttribute. incrementToken() reads the incoming
// term text and start offset from them and writes each emitted n-gram's text and offsets
// back into them.
75 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
76 private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
79 * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
81 * @param input {@link TokenStream} holding the input to be tokenized
82 * @param side the {@link Side} from which to chop off an n-gram
83 * @param minGram the smallest n-gram to generate
84 * @param maxGram the largest n-gram to generate
// Validates the arguments and stores the gram-size range.
// Three IllegalArgumentExceptions can be raised, but only the `minGram > maxGram` guard is
// visible in this excerpt. Judging by their messages, the other two reject a missing or
// unrecognised side and a minGram below 1 — TODO confirm against the full file.
// NOTE(review): the first message refers to "sideLabel" although this overload's parameter
// is named `side`; the wording matches the String-based overload.
86 public EdgeNGramTokenFilter(TokenStream input, Side side, int minGram, int maxGram) {
90 throw new IllegalArgumentException("sideLabel must be either front or back");
94 throw new IllegalArgumentException("minGram must be greater than zero");
97 if (minGram > maxGram) {
98 throw new IllegalArgumentException("minGram must not be greater than maxGram");
101 this.minGram = minGram;
102 this.maxGram = maxGram;
107 * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
109 * @param input {@link TokenStream} holding the input to be tokenized
110 * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
111 * @param minGram the smallest n-gram to generate
112 * @param maxGram the largest n-gram to generate
// Convenience overload: resolves sideLabel with Side.getSide and delegates to the
// Side-based constructor with the same input and gram-size range.
114 public EdgeNGramTokenFilter(TokenStream input, String sideLabel, int minGram, int maxGram) {
115 this(input, Side.getSide(sideLabel), minGram, maxGram);
// Produces the next edge n-gram of the current input term.
//
// Visible logic:
//  * when no term is buffered (curTermBuffer == null), the next token is pulled from `input`
//    and its term characters, length and start offset are captured; the gram size restarts
//    at minGram;
//  * when the gram size is within range and not longer than the term, the n-gram's offsets
//    and text are written into the attributes;
//  * curTermBuffer is set back to null near the end, so the null check fetches a new input
//    token the next time it is evaluated.
//
// NOTE(review): several lines of this method are missing from this excerpt (the embedded
// line numbers are not contiguous). That includes every return statement and whatever
// advances curGramSize, so the exact loop and return structure cannot be confirmed here.
119 public final boolean incrementToken() throws IOException {
121 if (curTermBuffer == null) {
122 if (!input.incrementToken()) {
// Keep a private copy of the term characters: termAtt itself is overwritten by
// copyBuffer further down when an n-gram is written.
125 curTermBuffer = termAtt.buffer().clone();
126 curTermLength = termAtt.length();
127 curGramSize = minGram;
128 tokStart = offsetAtt.startOffset();
// NOTE(review): the inner `curGramSize > maxGram` test can never be true inside the
// enclosing `curGramSize <= maxGram` guard; only the length check has an effect.
131 if (curGramSize <= maxGram) {
132 if (! (curGramSize > curTermLength // if the remaining input is too short, we can't generate any n-grams
133 || curGramSize > maxGram)) { // if we have hit the end of our n-gram size range, quit
134 // grab gramSize chars from front or back
// FRONT: the gram starts at index 0; otherwise it is the last curGramSize chars of the term.
135 int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
136 int end = start + curGramSize;
// Offsets are expressed relative to the start offset captured from the input token.
138 offsetAtt.setOffset(tokStart + start, tokStart + end);
139 termAtt.copyBuffer(curTermBuffer, start, curGramSize);
// Drop the buffered term so that a new input token is requested.
144 curTermBuffer = null;
// Clears the buffered term so that incrementToken() pulls a new token from the input
// on its next null check.
// NOTE(review): the line between the signature and the assignment is missing from this
// excerpt (presumably the super.reset() call — TODO confirm).
149 public void reset() throws IOException {
151 curTermBuffer = null;