X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?ds=sidebyside diff --git a/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java new file mode 100644 index 0000000..464bde0 --- /dev/null +++ b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java @@ -0,0 +1,541 @@ +package org.apache.lucene.analysis.shingle; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Iterator; +import java.util.LinkedList; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.util.AttributeSource; + + +/** + *
A ShingleFilter constructs shingles (token n-grams) from a token stream. + * In other words, it creates combinations of tokens as a single token. + * + *
For example, the sentence "please divide this sentence into shingles" + * might be tokenized into shingles "please divide", "divide this", + * "this sentence", "sentence into", and "into shingles". + * + *
This filter handles position increments > 1 by inserting filler tokens
+ * (tokens with termtext "_"). It does not handle a position increment of 0.
+ */
+public final class ShingleFilter extends TokenFilter {
+
+ /**
+ * filler token for when positionIncrement is more than 1
+ */
+ public static final char[] FILLER_TOKEN = { '_' };
+
+ /**
+ * default maximum shingle size is 2.
+ */
+ public static final int DEFAULT_MAX_SHINGLE_SIZE = 2;
+
+ /**
+ * default minimum shingle size is 2.
+ */
+ public static final int DEFAULT_MIN_SHINGLE_SIZE = 2;
+
+ /**
+ * default token type attribute value is "shingle"
+ */
+ public static final String DEFAULT_TOKEN_TYPE = "shingle";
+
+ /**
+ * The default string to use when joining adjacent tokens to form a shingle
+ */
+ public static final String TOKEN_SEPARATOR = " ";
+
+ /**
+ * The sequence of input stream tokens (or filler tokens, if necessary)
+ * that will be composed to form output shingles.
+ */
+ private LinkedList Shall we override the behavior of outputUnigrams==false for those
+ * times when no shingles are available (because there are fewer than
+ * minShingleSize tokens in the input stream)? (default: false.)
+ * Note that if outputUnigrams==true, then unigrams are always output,
+ * regardless of whether any shingles are available.
+ *
+ * @param outputUnigramsIfNoShingles Whether or not to output a single
+ * unigram when no shingles are available.
+ */
+ public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
+ this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
+ }
+
+ /**
+ * Set the max shingle size (default: 2)
+ *
+ * @param maxShingleSize max size of output shingles
+ */
+ public void setMaxShingleSize(int maxShingleSize) {
+ if (maxShingleSize < 2) {
+ throw new IllegalArgumentException("Max shingle size must be >= 2");
+ }
+ this.maxShingleSize = maxShingleSize;
+ }
+
+ /**
+ * Set the min shingle size (default: 2).
+ * This method requires that the passed in minShingleSize is not greater
+ * than maxShingleSize, so make sure that maxShingleSize is set before
+ * calling this method.
+ * The unigram output option is independent of the min shingle size.
+ *
+ * @param minShingleSize min size of output shingles
+ */
+ public void setMinShingleSize(int minShingleSize) {
+ if (minShingleSize < 2) {
+ throw new IllegalArgumentException("Min shingle size must be >= 2");
+ }
+ if (minShingleSize > maxShingleSize) {
+ throw new IllegalArgumentException
+ ("Min shingle size must be <= max shingle size");
+ }
+ this.minShingleSize = minShingleSize;
+ gramSize = new CircularSequence();
+ }
+
+ /**
+ * Sets the string to use when joining adjacent tokens to form a shingle
+ * @param tokenSeparator used to separate input stream tokens in output shingles
+ */
+ public void setTokenSeparator(String tokenSeparator) {
+ this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ boolean tokenAvailable = false;
+ int builtGramSize = 0;
+ if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
+ shiftInputWindow();
+ gramBuilder.setLength(0);
+ } else {
+ builtGramSize = gramSize.getPreviousValue();
+ }
+ if (inputWindow.size() >= gramSize.getValue()) {
+ boolean isAllFiller = true;
+ InputWindowToken nextToken = null;
+ Iterator Get the next token from the input stream.
+ * If the next token has Fills {@link #inputWindow} with input stream tokens, if available,
+ * shifting to the right if the window was previously full.
+ * Resets {@link #gramSize} to its minimum value.
+ *
+ * @throws IOException if there's a problem getting the next token
+ */
+ private void shiftInputWindow() throws IOException {
+ InputWindowToken firstToken = null;
+ if (inputWindow.size() > 0) {
+ firstToken = inputWindow.removeFirst();
+ }
+ while (inputWindow.size() < maxShingleSize) {
+ if (null != firstToken) { // recycle the firstToken, if available
+ if (null != getNextToken(firstToken)) {
+ inputWindow.add(firstToken); // the firstToken becomes the last
+ firstToken = null;
+ } else {
+ break; // end of input stream
+ }
+ } else {
+ InputWindowToken nextToken = getNextToken(null);
+ if (null != nextToken) {
+ inputWindow.add(nextToken);
+ } else {
+ break; // end of input stream
+ }
+ }
+ }
+ if (outputUnigramsIfNoShingles && noShingleOutput
+ && gramSize.minValue > 1 && inputWindow.size() < minShingleSize) {
+ gramSize.minValue = 1;
+ }
+ gramSize.reset();
+ isOutputHere = false;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ gramSize.reset();
+ inputWindow.clear();
+ numFillerTokensToInsert = 0;
+ isOutputHere = false;
+ noShingleOutput = true;
+ exhausted = false;
+ if (outputUnigramsIfNoShingles && ! outputUnigrams) {
+ // Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
+ gramSize.minValue = minShingleSize;
+ }
+ }
+
+
+ /**
+ * An instance of this class is used to maintain the number of input
+ * stream tokens that will be used to compose the next unigram or shingle:
+ * {@link #gramSize}.
+ * 1 is included in the circular sequence only if
+ * {@link #outputUnigrams} = true.
+ */
+ private class CircularSequence {
+ private int value;
+ private int previousValue;
+ private int minValue;
+
+ public CircularSequence() {
+ minValue = outputUnigrams ? 1 : minShingleSize;
+ reset();
+ }
+
+ /**
+ * @return the current value.
+ * @see #advance()
+ */
+ public int getValue() {
+ return value;
+ }
+
+ /**
+ * Increments this circular number's value to the next member in the
+ * circular sequence
+ * 1 is included in the circular sequence only if
+ * {@link #outputUnigrams} = true.
+ */
+ public void advance() {
+ previousValue = value;
+ if (value == 1) {
+ value = minShingleSize;
+ } else if (value == maxShingleSize) {
+ reset();
+ } else {
+ ++value;
+ }
+ }
+
+ /**
+ * Sets this circular number's value to the first member of the
+ * circular sequence
+ * 1 is included in the circular sequence only if
+ * {@link #outputUnigrams} = true.
+ */
+ public void reset() {
+ previousValue = value = minValue;
+ }
+
+ /**
+ * Returns true if the current value is the first member of the circular
+ * sequence.
+ * If {@link #outputUnigrams} = true, the first member of the circular
+ * sequence will be 1; otherwise, it will be {@link #minShingleSize}.
+ *
+ * @return true if the current value is the first member of the circular
+ * sequence; false otherwise
+ */
+ public boolean atMinValue() {
+ return value == minValue;
+ }
+
+ /**
+ * @return the value this instance had before the last advance() call
+ */
+ public int getPreviousValue() {
+ return previousValue;
+ }
+ }
+
+ private class InputWindowToken {
+ final AttributeSource attSource;
+ final CharTermAttribute termAtt;
+ final OffsetAttribute offsetAtt;
+ boolean isFiller = false;
+
+ public InputWindowToken(AttributeSource attSource) {
+ this.attSource = attSource;
+ this.termAtt = attSource.getAttribute(CharTermAttribute.class);
+ this.offsetAtt = attSource.getAttribute(OffsetAttribute.class);
+ }
+ }
+}
input
+ *
+ * @param input input stream
+ * @param minShingleSize minimum shingle size produced by the filter.
+ * @param maxShingleSize maximum shingle size produced by the filter.
+ */
+ public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize) {
+ super(input);
+ setMaxShingleSize(maxShingleSize);
+ setMinShingleSize(minShingleSize);
+ }
+
+ /**
+ * Constructs a ShingleFilter with the specified shingle size from the
+ * {@link TokenStream} input
+ *
+ * @param input input stream
+ * @param maxShingleSize maximum shingle size produced by the filter.
+ */
+ public ShingleFilter(TokenStream input, int maxShingleSize) {
+ this(input, DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize);
+ }
+
+ /**
+ * Construct a ShingleFilter with default shingle size: 2.
+ *
+ * @param input input stream
+ */
+ public ShingleFilter(TokenStream input) {
+ this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE);
+ }
+
+ /**
+ * Construct a ShingleFilter with the specified token type for shingle tokens
+ * and the default shingle size: 2
+ *
+ * @param input input stream
+ * @param tokenType token type for shingle tokens
+ */
+ public ShingleFilter(TokenStream input, String tokenType) {
+ this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE);
+ setTokenType(tokenType);
+ }
+
+ /**
+ * Set the type of the shingle tokens produced by this filter.
+ * (default: "shingle")
+ *
+ * @param tokenType token tokenType
+ */
+ public void setTokenType(String tokenType) {
+ this.tokenType = tokenType;
+ }
+
+ /**
+ * Shall the output stream contain the input tokens (unigrams) as well as
+ * shingles? (default: true.)
+ *
+ * @param outputUnigrams Whether or not the output stream shall contain
+ * the input tokens (unigrams)
+ */
+ public void setOutputUnigrams(boolean outputUnigrams) {
+ this.outputUnigrams = outputUnigrams;
+ gramSize = new CircularSequence();
+ }
+
+ /**
+ * positionIncrement > 1
,
+ * positionIncrement - 1
{@link #FILLER_TOKEN}s are
+ * inserted first.
+ * @param target Where to put the new token; if null, a new instance is created.
+ * @return On success, the populated token; null otherwise
+ * @throws IOException if the input stream has a problem
+ */
+ private InputWindowToken getNextToken(InputWindowToken target)
+ throws IOException {
+ InputWindowToken newTarget = target;
+ if (numFillerTokensToInsert > 0) {
+ if (null == target) {
+ newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
+ } else {
+ nextInputStreamToken.copyTo(target.attSource);
+ }
+ // A filler token occupies no space
+ newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
+ newTarget.offsetAtt.startOffset());
+ newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+ newTarget.isFiller = true;
+ --numFillerTokensToInsert;
+ } else if (isNextInputStreamToken) {
+ if (null == target) {
+ newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
+ } else {
+ nextInputStreamToken.copyTo(target.attSource);
+ }
+ isNextInputStreamToken = false;
+ newTarget.isFiller = false;
+ } else if (!exhausted && input.incrementToken()) {
+ if (null == target) {
+ newTarget = new InputWindowToken(cloneAttributes());
+ } else {
+ this.copyTo(target.attSource);
+ }
+ if (posIncrAtt.getPositionIncrement() > 1) {
+ // Each output shingle must contain at least one input token,
+ // so no more than (maxShingleSize - 1) filler tokens will be inserted.
+ numFillerTokensToInsert
+ = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
+ // Save the current token as the next input stream token
+ if (null == nextInputStreamToken) {
+ nextInputStreamToken = cloneAttributes();
+ } else {
+ this.copyTo(nextInputStreamToken);
+ }
+ isNextInputStreamToken = true;
+ // A filler token occupies no space
+ newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
+ newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+ newTarget.isFiller = true;
+ --numFillerTokensToInsert;
+ } else {
+ newTarget.isFiller = false;
+ }
+ } else {
+ newTarget = null;
+ exhausted = true;
+ }
+ return newTarget;
+ }
+
+ /**
+ * gramSize
will take on values from the circular sequence
+ * { [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }.
+ * gramSize
will take on values from the circular sequence
+ * { [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }.
+ * gramSize
will take on values from the circular sequence
+ * { [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }.
+ *