1 package org.apache.lucene.analysis.shingle;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.util.Iterator;
22 import java.util.LinkedList;
24 import org.apache.lucene.analysis.TokenFilter;
25 import org.apache.lucene.analysis.TokenStream;
26 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
27 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
28 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
29 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
30 import org.apache.lucene.util.AttributeSource;
34 * <p>A ShingleFilter constructs shingles (token n-grams) from a token stream.
35 * In other words, it creates combinations of tokens as a single token.
37 * <p>For example, the sentence "please divide this sentence into shingles"
38 * might be tokenized into shingles "please divide", "divide this",
39 * "this sentence", "sentence into", and "into shingles".
41 * <p>This filter handles position increments > 1 by inserting filler tokens
42 * (tokens with termtext "_"). It does not handle a position increment of 0.
44 public final class ShingleFilter extends TokenFilter {
47 * filler token for when positionIncrement is more than 1
49 public static final char[] FILLER_TOKEN = { '_' };
52 * default maximum shingle size is 2.
54 public static final int DEFAULT_MAX_SHINGLE_SIZE = 2;
57 * default minimum shingle size is 2.
59 public static final int DEFAULT_MIN_SHINGLE_SIZE = 2;
62 * default token type attribute value is "shingle"
64 public static final String DEFAULT_TOKEN_TYPE = "shingle";
67 * The default string to use when joining adjacent tokens to form a shingle
69 public static final String TOKEN_SEPARATOR = " ";
72 * The sequence of input stream tokens (or filler tokens, if necessary)
73 * that will be composed to form output shingles.
75 private LinkedList<InputWindowToken> inputWindow
76 = new LinkedList<InputWindowToken>();
79 * The number of input tokens in the next output token. This is the "n" in
82 private CircularSequence gramSize;
85 * Shingle and unigram text is composed here.
87 private StringBuilder gramBuilder = new StringBuilder();
90 * The token type attribute value to use - default is "shingle"
92 private String tokenType = DEFAULT_TOKEN_TYPE;
95 * The string to use when joining adjacent tokens to form a shingle
97 private String tokenSeparator = TOKEN_SEPARATOR;
100 * By default, we output unigrams (individual tokens) as well as shingles
103 private boolean outputUnigrams = true;
106 * By default, we don't override behavior of outputUnigrams.
108 private boolean outputUnigramsIfNoShingles = false;
111 * maximum shingle size (number of tokens)
113 private int maxShingleSize;
116 * minimum shingle size (number of tokens)
118 private int minShingleSize;
121 * The remaining number of filler tokens to be inserted into the input stream
122 * from which shingles are composed, to handle position increments greater
125 private int numFillerTokensToInsert;
128 * When the next input stream token has a position increment greater than
129 * one, it is stored in this field until sufficient filler tokens have been
130 * inserted to account for the position increment.
132 private AttributeSource nextInputStreamToken;
135 * Whether or not there is a next input stream token.
137 private boolean isNextInputStreamToken = false;
140 * Whether at least one unigram or shingle has been output at the current
143 private boolean isOutputHere = false;
146 * true if no shingles have been output yet (for outputUnigramsIfNoShingles).
148 boolean noShingleOutput = true;
150 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
151 private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
152 private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
153 private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
157 * Constructs a ShingleFilter with the specified shingle size from the
158 * {@link TokenStream} <code>input</code>
160 * @param input input stream
161 * @param minShingleSize minimum shingle size produced by the filter.
162 * @param maxShingleSize maximum shingle size produced by the filter.
164 public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize) {
166 setMaxShingleSize(maxShingleSize);
167 setMinShingleSize(minShingleSize);
171 * Constructs a ShingleFilter with the specified shingle size from the
172 * {@link TokenStream} <code>input</code>
174 * @param input input stream
175 * @param maxShingleSize maximum shingle size produced by the filter.
177 public ShingleFilter(TokenStream input, int maxShingleSize) {
178 this(input, DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize);
182 * Construct a ShingleFilter with default shingle size: 2.
184 * @param input input stream
186 public ShingleFilter(TokenStream input) {
187 this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE);
191 * Construct a ShingleFilter with the specified token type for shingle tokens
192 * and the default shingle size: 2
194 * @param input input stream
195 * @param tokenType token type for shingle tokens
197 public ShingleFilter(TokenStream input, String tokenType) {
198 this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE);
199 setTokenType(tokenType);
203 * Set the type of the shingle tokens produced by this filter.
204 * (default: "shingle")
206 * @param tokenType token tokenType
208 public void setTokenType(String tokenType) {
209 this.tokenType = tokenType;
213 * Shall the output stream contain the input tokens (unigrams) as well as
214 * shingles? (default: true.)
216 * @param outputUnigrams Whether or not the output stream shall contain
217 * the input tokens (unigrams)
219 public void setOutputUnigrams(boolean outputUnigrams) {
220 this.outputUnigrams = outputUnigrams;
221 gramSize = new CircularSequence();
225 * <p>Shall we override the behavior of outputUnigrams==false for those
226 * times when no shingles are available (because there are fewer than
227 * minShingleSize tokens in the input stream)? (default: false.)
228 * <p>Note that if outputUnigrams==true, then unigrams are always output,
229 * regardless of whether any shingles are available.
231 * @param outputUnigramsIfNoShingles Whether or not to output a single
232 * unigram when no shingles are available.
234 public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
235 this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
239 * Set the max shingle size (default: 2)
241 * @param maxShingleSize max size of output shingles
243 public void setMaxShingleSize(int maxShingleSize) {
244 if (maxShingleSize < 2) {
245 throw new IllegalArgumentException("Max shingle size must be >= 2");
247 this.maxShingleSize = maxShingleSize;
251 * <p>Set the min shingle size (default: 2).
252 * <p>This method requires that the passed in minShingleSize is not greater
253 * than maxShingleSize, so make sure that maxShingleSize is set before
254 * calling this method.
255 * <p>The unigram output option is independent of the min shingle size.
257 * @param minShingleSize min size of output shingles
259 public void setMinShingleSize(int minShingleSize) {
260 if (minShingleSize < 2) {
261 throw new IllegalArgumentException("Min shingle size must be >= 2");
263 if (minShingleSize > maxShingleSize) {
264 throw new IllegalArgumentException
265 ("Min shingle size must be <= max shingle size");
267 this.minShingleSize = minShingleSize;
268 gramSize = new CircularSequence();
272 * Sets the string to use when joining adjacent tokens to form a shingle
273 * @param tokenSeparator used to separate input stream tokens in output shingles
275 public void setTokenSeparator(String tokenSeparator) {
276 this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
280 public final boolean incrementToken() throws IOException {
281 boolean tokenAvailable = false;
282 int builtGramSize = 0;
283 if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
285 gramBuilder.setLength(0);
287 builtGramSize = gramSize.getPreviousValue();
289 if (inputWindow.size() >= gramSize.getValue()) {
290 boolean isAllFiller = true;
291 InputWindowToken nextToken = null;
292 Iterator<InputWindowToken> iter = inputWindow.iterator();
293 for (int gramNum = 1 ;
294 iter.hasNext() && builtGramSize < gramSize.getValue() ;
296 nextToken = iter.next();
297 if (builtGramSize < gramNum) {
298 if (builtGramSize > 0) {
299 gramBuilder.append(tokenSeparator);
301 gramBuilder.append(nextToken.termAtt.buffer(), 0,
302 nextToken.termAtt.length());
305 if (isAllFiller && nextToken.isFiller) {
306 if (gramNum == gramSize.getValue()) {
313 if ( ! isAllFiller && builtGramSize == gramSize.getValue()) {
314 inputWindow.getFirst().attSource.copyTo(this);
315 posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1);
316 termAtt.setEmpty().append(gramBuilder);
317 if (gramSize.getValue() > 1) {
318 typeAtt.setType(tokenType);
319 noShingleOutput = false;
321 offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
324 tokenAvailable = true;
327 return tokenAvailable;
330 private boolean exhausted;
333 * <p>Get the next token from the input stream.
334 * <p>If the next token has <code>positionIncrement > 1</code>,
335 * <code>positionIncrement - 1</code> {@link #FILLER_TOKEN}s are
337 * @param target Where to put the new token; if null, a new instance is created.
338 * @return On success, the populated token; null otherwise
339 * @throws IOException if the input stream has a problem
341 private InputWindowToken getNextToken(InputWindowToken target)
343 InputWindowToken newTarget = target;
344 if (numFillerTokensToInsert > 0) {
345 if (null == target) {
346 newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
348 nextInputStreamToken.copyTo(target.attSource);
350 // A filler token occupies no space
351 newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
352 newTarget.offsetAtt.startOffset());
353 newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
354 newTarget.isFiller = true;
355 --numFillerTokensToInsert;
356 } else if (isNextInputStreamToken) {
357 if (null == target) {
358 newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
360 nextInputStreamToken.copyTo(target.attSource);
362 isNextInputStreamToken = false;
363 newTarget.isFiller = false;
364 } else if (!exhausted && input.incrementToken()) {
365 if (null == target) {
366 newTarget = new InputWindowToken(cloneAttributes());
368 this.copyTo(target.attSource);
370 if (posIncrAtt.getPositionIncrement() > 1) {
371 // Each output shingle must contain at least one input token,
372 // so no more than (maxShingleSize - 1) filler tokens will be inserted.
373 numFillerTokensToInsert
374 = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
375 // Save the current token as the next input stream token
376 if (null == nextInputStreamToken) {
377 nextInputStreamToken = cloneAttributes();
379 this.copyTo(nextInputStreamToken);
381 isNextInputStreamToken = true;
382 // A filler token occupies no space
383 newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
384 newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
385 newTarget.isFiller = true;
386 --numFillerTokensToInsert;
388 newTarget.isFiller = false;
398 * <p>Fills {@link #inputWindow} with input stream tokens, if available,
399 * shifting to the right if the window was previously full.
400 * <p>Resets {@link #gramSize} to its minimum value.
402 * @throws IOException if there's a problem getting the next token
404 private void shiftInputWindow() throws IOException {
405 InputWindowToken firstToken = null;
406 if (inputWindow.size() > 0) {
407 firstToken = inputWindow.removeFirst();
409 while (inputWindow.size() < maxShingleSize) {
410 if (null != firstToken) { // recycle the firstToken, if available
411 if (null != getNextToken(firstToken)) {
412 inputWindow.add(firstToken); // the firstToken becomes the last
415 break; // end of input stream
418 InputWindowToken nextToken = getNextToken(null);
419 if (null != nextToken) {
420 inputWindow.add(nextToken);
422 break; // end of input stream
426 if (outputUnigramsIfNoShingles && noShingleOutput
427 && gramSize.minValue > 1 && inputWindow.size() < minShingleSize) {
428 gramSize.minValue = 1;
431 isOutputHere = false;
435 public void reset() throws IOException {
439 numFillerTokensToInsert = 0;
440 isOutputHere = false;
441 noShingleOutput = true;
443 if (outputUnigramsIfNoShingles && ! outputUnigrams) {
444 // Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
445 gramSize.minValue = minShingleSize;
451 * <p>An instance of this class is used to maintain the number of input
452 * stream tokens that will be used to compose the next unigram or shingle:
454 * <p><code>gramSize</code> will take on values from the circular sequence
455 * <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
456 * <p>1 is included in the circular sequence only if
457 * {@link #outputUnigrams} = true.
459 private class CircularSequence {
461 private int previousValue;
462 private int minValue;
464 public CircularSequence() {
465 minValue = outputUnigrams ? 1 : minShingleSize;
470 * @return the current value.
473 public int getValue() {
478 * <p>Increments this circular number's value to the next member in the
480 * <code>gramSize</code> will take on values from the circular sequence
481 * <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
482 * <p>1 is included in the circular sequence only if
483 * {@link #outputUnigrams} = true.
485 public void advance() {
486 previousValue = value;
488 value = minShingleSize;
489 } else if (value == maxShingleSize) {
497 * <p>Sets this circular number's value to the first member of the
499 * <p><code>gramSize</code> will take on values from the circular sequence
500 * <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
501 * <p>1 is included in the circular sequence only if
502 * {@link #outputUnigrams} = true.
504 public void reset() {
505 previousValue = value = minValue;
509 * <p>Returns true if the current value is the first member of the circular
511 * <p>If {@link #outputUnigrams} = true, the first member of the circular
512 * sequence will be 1; otherwise, it will be {@link #minShingleSize}.
514 * @return true if the current value is the first member of the circular
515 * sequence; false otherwise
517 public boolean atMinValue() {
518 return value == minValue;
522 * @return the value this instance had before the last advance() call
524 public int getPreviousValue() {
525 return previousValue;
529 private class InputWindowToken {
530 final AttributeSource attSource;
531 final CharTermAttribute termAtt;
532 final OffsetAttribute offsetAtt;
533 boolean isFiller = false;
535 public InputWindowToken(AttributeSource attSource) {
536 this.attSource = attSource;
537 this.termAtt = attSource.getAttribute(CharTermAttribute.class);
538 this.offsetAtt = attSource.getAttribute(OffsetAttribute.class);