X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?ds=sidebyside diff --git a/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java new file mode 100644 index 0000000..464bde0 --- /dev/null +++ b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java @@ -0,0 +1,541 @@ +package org.apache.lucene.analysis.shingle; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Iterator; +import java.util.LinkedList; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.util.AttributeSource; + + +/** + *

A ShingleFilter constructs shingles (token n-grams) from a token stream. + * In other words, it creates combinations of tokens as a single token. + * + *

For example, the sentence "please divide this sentence into shingles" + * might be tokenized into shingles "please divide", "divide this", + * "this sentence", "sentence into", and "into shingles". + * + *

This filter handles position increments > 1 by inserting filler tokens + * (tokens with termtext "_"). It does not handle a position increment of 0. + */ +public final class ShingleFilter extends TokenFilter { + + /** + * filler token for when positionIncrement is more than 1 + */ + public static final char[] FILLER_TOKEN = { '_' }; + + /** + * default maximum shingle size is 2. + */ + public static final int DEFAULT_MAX_SHINGLE_SIZE = 2; + + /** + * default minimum shingle size is 2. + */ + public static final int DEFAULT_MIN_SHINGLE_SIZE = 2; + + /** + * default token type attribute value is "shingle" + */ + public static final String DEFAULT_TOKEN_TYPE = "shingle"; + + /** + * The default string to use when joining adjacent tokens to form a shingle + */ + public static final String TOKEN_SEPARATOR = " "; + + /** + * The sequence of input stream tokens (or filler tokens, if necessary) + * that will be composed to form output shingles. + */ + private LinkedList inputWindow + = new LinkedList(); + + /** + * The number of input tokens in the next output token. This is the "n" in + * "token n-grams". + */ + private CircularSequence gramSize; + + /** + * Shingle and unigram text is composed here. + */ + private StringBuilder gramBuilder = new StringBuilder(); + + /** + * The token type attribute value to use - default is "shingle" + */ + private String tokenType = DEFAULT_TOKEN_TYPE; + + /** + * The string to use when joining adjacent tokens to form a shingle + */ + private String tokenSeparator = TOKEN_SEPARATOR; + + /** + * By default, we output unigrams (individual tokens) as well as shingles + * (token n-grams). + */ + private boolean outputUnigrams = true; + + /** + * By default, we don't override behavior of outputUnigrams. + */ + private boolean outputUnigramsIfNoShingles = false; + + /** + * maximum shingle size (number of tokens) + */ + private int maxShingleSize; + + /** + * minimum shingle size (number of tokens) + */ + private int minShingleSize; + + /** + * The remaining number of filler tokens to be inserted into the input stream + * from which shingles are composed, to handle position increments greater + * than one. + */ + private int numFillerTokensToInsert; + + /** + * When the next input stream token has a position increment greater than + * one, it is stored in this field until sufficient filler tokens have been + * inserted to account for the position increment. + */ + private AttributeSource nextInputStreamToken; + + /** + * Whether or not there is a next input stream token. + */ + private boolean isNextInputStreamToken = false; + + /** + * Whether at least one unigram or shingle has been output at the current + * position. + */ + private boolean isOutputHere = false; + + /** + * true if no shingles have been output yet (for outputUnigramsIfNoShingles). + */ + boolean noShingleOutput = true; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + + + /** + * Constructs a ShingleFilter with the specified shingle size from the + * {@link TokenStream} input + * + * @param input input stream + * @param minShingleSize minimum shingle size produced by the filter. + * @param maxShingleSize maximum shingle size produced by the filter. + */ + public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize) { + super(input); + setMaxShingleSize(maxShingleSize); + setMinShingleSize(minShingleSize); + } + + /** + * Constructs a ShingleFilter with the specified shingle size from the + * {@link TokenStream} input + * + * @param input input stream + * @param maxShingleSize maximum shingle size produced by the filter. + */ + public ShingleFilter(TokenStream input, int maxShingleSize) { + this(input, DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize); + } + + /** + * Construct a ShingleFilter with default shingle size: 2. + * + * @param input input stream + */ + public ShingleFilter(TokenStream input) { + this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE); + } + + /** + * Construct a ShingleFilter with the specified token type for shingle tokens + * and the default shingle size: 2 + * + * @param input input stream + * @param tokenType token type for shingle tokens + */ + public ShingleFilter(TokenStream input, String tokenType) { + this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE); + setTokenType(tokenType); + } + + /** + * Set the type of the shingle tokens produced by this filter. + * (default: "shingle") + * + * @param tokenType token tokenType + */ + public void setTokenType(String tokenType) { + this.tokenType = tokenType; + } + + /** + * Shall the output stream contain the input tokens (unigrams) as well as + * shingles? (default: true.) + * + * @param outputUnigrams Whether or not the output stream shall contain + * the input tokens (unigrams) + */ + public void setOutputUnigrams(boolean outputUnigrams) { + this.outputUnigrams = outputUnigrams; + gramSize = new CircularSequence(); + } + + /** + *

Shall we override the behavior of outputUnigrams==false for those + * times when no shingles are available (because there are fewer than + * minShingleSize tokens in the input stream)? (default: false.) + *

Note that if outputUnigrams==true, then unigrams are always output, + * regardless of whether any shingles are available. + * + * @param outputUnigramsIfNoShingles Whether or not to output a single + * unigram when no shingles are available. + */ + public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) { + this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles; + } + + /** + * Set the max shingle size (default: 2) + * + * @param maxShingleSize max size of output shingles + */ + public void setMaxShingleSize(int maxShingleSize) { + if (maxShingleSize < 2) { + throw new IllegalArgumentException("Max shingle size must be >= 2"); + } + this.maxShingleSize = maxShingleSize; + } + + /** + *

Set the min shingle size (default: 2). + *

This method requires that the passed in minShingleSize is not greater + * than maxShingleSize, so make sure that maxShingleSize is set before + * calling this method. + *

The unigram output option is independent of the min shingle size. + * + * @param minShingleSize min size of output shingles + */ + public void setMinShingleSize(int minShingleSize) { + if (minShingleSize < 2) { + throw new IllegalArgumentException("Min shingle size must be >= 2"); + } + if (minShingleSize > maxShingleSize) { + throw new IllegalArgumentException + ("Min shingle size must be <= max shingle size"); + } + this.minShingleSize = minShingleSize; + gramSize = new CircularSequence(); + } + + /** + * Sets the string to use when joining adjacent tokens to form a shingle + * @param tokenSeparator used to separate input stream tokens in output shingles + */ + public void setTokenSeparator(String tokenSeparator) { + this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator; + } + + @Override + public final boolean incrementToken() throws IOException { + boolean tokenAvailable = false; + int builtGramSize = 0; + if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) { + shiftInputWindow(); + gramBuilder.setLength(0); + } else { + builtGramSize = gramSize.getPreviousValue(); + } + if (inputWindow.size() >= gramSize.getValue()) { + boolean isAllFiller = true; + InputWindowToken nextToken = null; + Iterator iter = inputWindow.iterator(); + for (int gramNum = 1 ; + iter.hasNext() && builtGramSize < gramSize.getValue() ; + ++gramNum) { + nextToken = iter.next(); + if (builtGramSize < gramNum) { + if (builtGramSize > 0) { + gramBuilder.append(tokenSeparator); + } + gramBuilder.append(nextToken.termAtt.buffer(), 0, + nextToken.termAtt.length()); + ++builtGramSize; + } + if (isAllFiller && nextToken.isFiller) { + if (gramNum == gramSize.getValue()) { + gramSize.advance(); + } + } else { + isAllFiller = false; + } + } + if ( ! isAllFiller && builtGramSize == gramSize.getValue()) { + inputWindow.getFirst().attSource.copyTo(this); + posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1); + termAtt.setEmpty().append(gramBuilder); + if (gramSize.getValue() > 1) { + typeAtt.setType(tokenType); + noShingleOutput = false; + } + offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset()); + isOutputHere = true; + gramSize.advance(); + tokenAvailable = true; + } + } + return tokenAvailable; + } + + private boolean exhausted; + + /** + *

Get the next token from the input stream. + *

If the next token has positionIncrement > 1, + * positionIncrement - 1 {@link #FILLER_TOKEN}s are + * inserted first. + * @param target Where to put the new token; if null, a new instance is created. + * @return On success, the populated token; null otherwise + * @throws IOException if the input stream has a problem + */ + private InputWindowToken getNextToken(InputWindowToken target) + throws IOException { + InputWindowToken newTarget = target; + if (numFillerTokensToInsert > 0) { + if (null == target) { + newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes()); + } else { + nextInputStreamToken.copyTo(target.attSource); + } + // A filler token occupies no space + newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(), + newTarget.offsetAtt.startOffset()); + newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length); + newTarget.isFiller = true; + --numFillerTokensToInsert; + } else if (isNextInputStreamToken) { + if (null == target) { + newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes()); + } else { + nextInputStreamToken.copyTo(target.attSource); + } + isNextInputStreamToken = false; + newTarget.isFiller = false; + } else if (!exhausted && input.incrementToken()) { + if (null == target) { + newTarget = new InputWindowToken(cloneAttributes()); + } else { + this.copyTo(target.attSource); + } + if (posIncrAtt.getPositionIncrement() > 1) { + // Each output shingle must contain at least one input token, + // so no more than (maxShingleSize - 1) filler tokens will be inserted. + numFillerTokensToInsert + = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1); + // Save the current token as the next input stream token + if (null == nextInputStreamToken) { + nextInputStreamToken = cloneAttributes(); + } else { + this.copyTo(nextInputStreamToken); + } + isNextInputStreamToken = true; + // A filler token occupies no space + newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset()); + newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length); + newTarget.isFiller = true; + --numFillerTokensToInsert; + } else { + newTarget.isFiller = false; + } + } else { + newTarget = null; + exhausted = true; + } + return newTarget; + } + + /** + *

Fills {@link #inputWindow} with input stream tokens, if available, + * shifting to the right if the window was previously full. + *

Resets {@link #gramSize} to its minimum value. + * + * @throws IOException if there's a problem getting the next token + */ + private void shiftInputWindow() throws IOException { + InputWindowToken firstToken = null; + if (inputWindow.size() > 0) { + firstToken = inputWindow.removeFirst(); + } + while (inputWindow.size() < maxShingleSize) { + if (null != firstToken) { // recycle the firstToken, if available + if (null != getNextToken(firstToken)) { + inputWindow.add(firstToken); // the firstToken becomes the last + firstToken = null; + } else { + break; // end of input stream + } + } else { + InputWindowToken nextToken = getNextToken(null); + if (null != nextToken) { + inputWindow.add(nextToken); + } else { + break; // end of input stream + } + } + } + if (outputUnigramsIfNoShingles && noShingleOutput + && gramSize.minValue > 1 && inputWindow.size() < minShingleSize) { + gramSize.minValue = 1; + } + gramSize.reset(); + isOutputHere = false; + } + + @Override + public void reset() throws IOException { + super.reset(); + gramSize.reset(); + inputWindow.clear(); + numFillerTokensToInsert = 0; + isOutputHere = false; + noShingleOutput = true; + exhausted = false; + if (outputUnigramsIfNoShingles && ! outputUnigrams) { + // Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles + gramSize.minValue = minShingleSize; + } + } + + + /** + *

An instance of this class is used to maintain the number of input + * stream tokens that will be used to compose the next unigram or shingle: + * {@link #gramSize}. + *

gramSize will take on values from the circular sequence + * { [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }. + *

1 is included in the circular sequence only if + * {@link #outputUnigrams} = true. + */ + private class CircularSequence { + private int value; + private int previousValue; + private int minValue; + + public CircularSequence() { + minValue = outputUnigrams ? 1 : minShingleSize; + reset(); + } + + /** + * @return the current value. + * @see #advance() + */ + public int getValue() { + return value; + } + + /** + *

Increments this circular number's value to the next member in the + * circular sequence + * gramSize will take on values from the circular sequence + * { [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }. + *

1 is included in the circular sequence only if + * {@link #outputUnigrams} = true. + */ + public void advance() { + previousValue = value; + if (value == 1) { + value = minShingleSize; + } else if (value == maxShingleSize) { + reset(); + } else { + ++value; + } + } + + /** + *

Sets this circular number's value to the first member of the + * circular sequence + *

gramSize will take on values from the circular sequence + * { [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }. + *

1 is included in the circular sequence only if + * {@link #outputUnigrams} = true. + */ + public void reset() { + previousValue = value = minValue; + } + + /** + *

Returns true if the current value is the first member of the circular + * sequence. + *

If {@link #outputUnigrams} = true, the first member of the circular + * sequence will be 1; otherwise, it will be {@link #minShingleSize}. + * + * @return true if the current value is the first member of the circular + * sequence; false otherwise + */ + public boolean atMinValue() { + return value == minValue; + } + + /** + * @return the value this instance had before the last advance() call + */ + public int getPreviousValue() { + return previousValue; + } + } + + private class InputWindowToken { + final AttributeSource attSource; + final CharTermAttribute termAtt; + final OffsetAttribute offsetAtt; + boolean isFiller = false; + + public InputWindowToken(AttributeSource attSource) { + this.attSource = attSource; + this.termAtt = attSource.getAttribute(CharTermAttribute.class); + this.offsetAtt = attSource.getAttribute(OffsetAttribute.class); + } + } +}