X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java?ds=sidebyside diff --git a/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java new file mode 100644 index 0000000..95dd380 --- /dev/null +++ b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java @@ -0,0 +1,1058 @@ +package org.apache.lucene.analysis.shingle; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.NoSuchElementException; +import java.util.Set; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream; +import org.apache.lucene.analysis.payloads.PayloadHelper; +import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column.Row; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.index.Payload; + + +/** + *

A ShingleMatrixFilter constructs shingles (token n-grams) from a token stream. + * In other words, it creates combinations of tokens as a single token. + * + *

For example, the sentence "please divide this sentence into shingles" + * might be tokenized into shingles "please divide", "divide this", + * "this sentence", "sentence into", and "into shingles". + * + *

Using a shingle filter at index and query time can in some instances + * be used to replace phrase queries, especially them with 0 slop. + * + *

Without a spacer character + * it can be used to handle composition and decomposition of words + * such as searching for "multi dimensional" instead of "multidimensional". + * It is a rather common human problem at query time + * in several languages, notably the northern Germanic branch. + * + *

Shingles are amongst many things also known to solve problems + * in spell checking, language detection and document clustering. + * + *

This filter is backed by a three dimensional column oriented matrix + * used to create permutations of the second dimension, the rows, + * and leaves the third, the z-axis, for for multi token synonyms. + * + *

In order to use this filter you need to define a way of positioning + * the input stream tokens in the matrix. This is done using a + * {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec}. + * There are three simple implementations for demonstrational purposes, + * see {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec}, + * {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec} + * and {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec}. + * + *

Consider this token matrix: + *

+ *  Token[column][row][z-axis]{
+ *    {{hello}, {greetings, and, salutations}},
+ *    {{world}, {earth}, {tellus}}
+ *  };
+ * 
+ * + * It would produce the following 2-3 gram sized shingles: + * + *
+ * "hello_world"
+ * "greetings_and"
+ * "greetings_and_salutations"
+ * "and_salutations"
+ * "and_salutations_world"
+ * "salutations_world"
+ * "hello_earth"
+ * "and_salutations_earth"
+ * "salutations_earth"
+ * "hello_tellus"
+ * "and_salutations_tellus"
+ * "salutations_tellus"
+ *  
+ * + *

This implementation can be rather heap demanding + * if (maximum shingle size - minimum shingle size) is a great number and the stream contains many columns, + * or if each column contains a great number of rows. + * + *

The problem is that in order avoid producing duplicates + * the filter needs to keep track of any shingle already produced and returned to the consumer. + * + * There is a bit of resource management to handle this + * but it would of course be much better if the filter was written + * so it never created the same shingle more than once in the first place. + * + *

The filter also has basic support for calculating weights for the shingles + * based on the weights of the tokens from the input stream, output shingle size, etc. + * See {@link #calculateShingleWeight(org.apache.lucene.analysis.Token, java.util.List, int, java.util.List, java.util.List)}. + *

+ * @deprecated Will be removed in Lucene 4.0. This filter is unmaintained and might not behave + * correctly if used with custom Attributes, i.e. Attributes other than + * the ones located in org.apache.lucene.analysis.tokenattributes. It also uses + * hardcoded payload encoders which makes it not easily adaptable to other use-cases. + */ +@Deprecated +public final class ShingleMatrixFilter extends TokenStream { + + public static Character defaultSpacerCharacter = Character.valueOf('_'); + public static TokenSettingsCodec defaultSettingsCodec = new OneDimensionalNonWeightedTokenSettingsCodec(); + public static boolean ignoringSinglePrefixOrSuffixShingleByDefault = false; + + /** + * Strategy used to code and decode meta data of the tokens from the input stream + * regarding how to position the tokens in the matrix, set and retreive weight, et c. + */ + public static abstract class TokenSettingsCodec { + + /** + * Retrieves information on how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}. + * @param token + * @return {@link ShingleMatrixFilter.TokenPositioner} + * @throws IOException + */ + public abstract TokenPositioner getTokenPositioner(Token token) throws IOException; + + /** + * Sets information on how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}. + * + * @param token + * @param tokenPositioner + */ + public abstract void setTokenPositioner(Token token, ShingleMatrixFilter.TokenPositioner tokenPositioner); + + /** + * Have this method return 1f in order to 'disable' weights. + * @param token + * @return the weight of parameter token + */ + public abstract float getWeight(Token token); + + /** + * Have this method do nothing in order to 'disable' weights. + * @param token + * @param weight + */ + public abstract void setWeight(Token token, float weight); + } + + + /** + * Used to describe how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}. + * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#getTokenPositioner(org.apache.lucene.analysis.Token) + * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#setTokenPositioner(org.apache.lucene.analysis.Token,org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenPositioner) + */ + public static class TokenPositioner { + public static final TokenPositioner newColumn = new TokenPositioner(0); + public static final TokenPositioner newRow = new TokenPositioner(1); + public static final TokenPositioner sameRow = new TokenPositioner(2); + + private final int index; + + private TokenPositioner(int index) { + this.index = index; + } + + public int getIndex() { + return index; + } + } + + // filter instance settings variables + + private TokenSettingsCodec settingsCodec; + + private int minimumShingleSize; + private int maximumShingleSize; + + private boolean ignoringSinglePrefixOrSuffixShingle = false; + + private Character spacerCharacter = defaultSpacerCharacter; + + private TokenStream input; + + private CharTermAttribute termAtt; + private PositionIncrementAttribute posIncrAtt; + private PayloadAttribute payloadAtt; + private OffsetAttribute offsetAtt; + private TypeAttribute typeAtt; + private FlagsAttribute flagsAtt; + + private CharTermAttribute in_termAtt; + private PositionIncrementAttribute in_posIncrAtt; + private PayloadAttribute in_payloadAtt; + private OffsetAttribute in_offsetAtt; + private TypeAttribute in_typeAtt; + private FlagsAttribute in_flagsAtt; + + + /** + * Creates a shingle filter based on a user defined matrix. + * + * The filter /will/ delete columns from the input matrix! You will not be able to reset the filter if you used this constructor. + * todo: don't touch the matrix! use a boolean, set the input stream to null or something, and keep track of where in the matrix we are at. + * + * @param matrix the input based for creating shingles. Does not need to contain any information until {@link #incrementToken()} is called the first time. + * @param minimumShingleSize minimum number of tokens in any shingle. + * @param maximumShingleSize maximum number of tokens in any shingle. + * @param spacerCharacter character to use between texts of the token parts in a shingle. null for none. + * @param ignoringSinglePrefixOrSuffixShingle if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'. + * @param settingsCodec codec used to read input token weight and matrix positioning. + */ + public ShingleMatrixFilter(Matrix matrix, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec) { + this.matrix = matrix; + this.minimumShingleSize = minimumShingleSize; + this.maximumShingleSize = maximumShingleSize; + this.spacerCharacter = spacerCharacter; + this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle; + this.settingsCodec = settingsCodec; + + termAtt = addAttribute(CharTermAttribute.class); + posIncrAtt = addAttribute(PositionIncrementAttribute.class); + payloadAtt = addAttribute(PayloadAttribute.class); + offsetAtt = addAttribute(OffsetAttribute.class); + typeAtt = addAttribute(TypeAttribute.class); + flagsAtt = addAttribute(FlagsAttribute.class); + + // set the input to be an empty token stream, we already have the data. + this.input = new EmptyTokenStream(); + + in_termAtt = input.addAttribute(CharTermAttribute.class); + in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class); + in_payloadAtt = input.addAttribute(PayloadAttribute.class); + in_offsetAtt = input.addAttribute(OffsetAttribute.class); + in_typeAtt = input.addAttribute(TypeAttribute.class); + in_flagsAtt = input.addAttribute(FlagsAttribute.class); + } + + /** + * Creates a shingle filter using default settings. + * + * @see #defaultSpacerCharacter + * @see #ignoringSinglePrefixOrSuffixShingleByDefault + * @see #defaultSettingsCodec + * + * @param input stream from which to construct the matrix + * @param minimumShingleSize minimum number of tokens in any shingle. + * @param maximumShingleSize maximum number of tokens in any shingle. + */ + public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize) { + this(input, minimumShingleSize, maximumShingleSize, defaultSpacerCharacter); + } + + + /** + * Creates a shingle filter using default settings. + * + * @see #ignoringSinglePrefixOrSuffixShingleByDefault + * @see #defaultSettingsCodec + * + * @param input stream from which to construct the matrix + * @param minimumShingleSize minimum number of tokens in any shingle. + * @param maximumShingleSize maximum number of tokens in any shingle. + * @param spacerCharacter character to use between texts of the token parts in a shingle. null for none. + */ + public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter) { + this(input, minimumShingleSize, maximumShingleSize, spacerCharacter, ignoringSinglePrefixOrSuffixShingleByDefault); + } + + /** + * Creates a shingle filter using the default {@link TokenSettingsCodec}. + * + * @see #defaultSettingsCodec + * + * @param input stream from which to construct the matrix + * @param minimumShingleSize minimum number of tokens in any shingle. + * @param maximumShingleSize maximum number of tokens in any shingle. + * @param spacerCharacter character to use between texts of the token parts in a shingle. null for none. + * @param ignoringSinglePrefixOrSuffixShingle if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'. + */ + public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle) { + this(input, minimumShingleSize, maximumShingleSize, spacerCharacter, ignoringSinglePrefixOrSuffixShingle, defaultSettingsCodec); + } + + + /** + * Creates a shingle filter with ad hoc parameter settings. + * + * @param input stream from which to construct the matrix + * @param minimumShingleSize minimum number of tokens in any shingle. + * @param maximumShingleSize maximum number of tokens in any shingle. + * @param spacerCharacter character to use between texts of the token parts in a shingle. null for none. + * @param ignoringSinglePrefixOrSuffixShingle if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'. + * @param settingsCodec codec used to read input token weight and matrix positioning. + */ + public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec) { + this.input = input; + this.minimumShingleSize = minimumShingleSize; + this.maximumShingleSize = maximumShingleSize; + this.spacerCharacter = spacerCharacter; + this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle; + this.settingsCodec = settingsCodec; + termAtt = addAttribute(CharTermAttribute.class); + posIncrAtt = addAttribute(PositionIncrementAttribute.class); + payloadAtt = addAttribute(PayloadAttribute.class); + offsetAtt = addAttribute(OffsetAttribute.class); + typeAtt = addAttribute(TypeAttribute.class); + flagsAtt = addAttribute(FlagsAttribute.class); + + in_termAtt = input.addAttribute(CharTermAttribute.class); + in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class); + in_payloadAtt = input.addAttribute(PayloadAttribute.class); + in_offsetAtt = input.addAttribute(OffsetAttribute.class); + in_typeAtt = input.addAttribute(TypeAttribute.class); + in_flagsAtt = input.addAttribute(FlagsAttribute.class); + } + + // internal filter instance variables + + /** iterator over the current matrix row permutations */ + private Iterator permutations; + + /** the current permutation of tokens used to produce shingles */ + private List currentPermuationTokens; + /** index to what row a token in currentShingleTokens represents*/ + private List currentPermutationRows; + + private int currentPermutationTokensStartOffset; + private int currentShingleLength; + + /** + * a set containing shingles that has been the result of a call to {@link #incrementToken()}, + * used to avoid producing the same shingle more than once. + */ + private Set> shinglesSeen = new HashSet>(); + + + @Override + public void reset() throws IOException { + permutations = null; + shinglesSeen.clear(); + input.reset(); + } + + private Matrix matrix; + + private Token reusableToken = new Token(); + + @Override + public final boolean incrementToken() throws IOException { + if (matrix == null) { + matrix = new Matrix(); + // fill matrix with maximumShingleSize columns + while (matrix.columns.size() < maximumShingleSize && readColumn()) { + // this loop looks ugly + } + } + + // this loop exists in order to avoid recursive calls to the next method + // as the complexity of a large matrix + // then would require a multi gigabyte sized stack. + Token token; + do { + token = produceNextToken(reusableToken); + } while (token == request_next_token); + if (token == null) return false; + + clearAttributes(); + termAtt.copyBuffer(token.buffer(), 0, token.length()); + posIncrAtt.setPositionIncrement(token.getPositionIncrement()); + flagsAtt.setFlags(token.getFlags()); + offsetAtt.setOffset(token.startOffset(), token.endOffset()); + typeAtt.setType(token.type()); + payloadAtt.setPayload(token.getPayload()); + return true; + } + + private Token getNextInputToken(Token token) throws IOException { + if (!input.incrementToken()) return null; + token.copyBuffer(in_termAtt.buffer(), 0, in_termAtt.length()); + token.setPositionIncrement(in_posIncrAtt.getPositionIncrement()); + token.setFlags(in_flagsAtt.getFlags()); + token.setOffset(in_offsetAtt.startOffset(), in_offsetAtt.endOffset()); + token.setType(in_typeAtt.type()); + token.setPayload(in_payloadAtt.getPayload()); + return token; + } + + private Token getNextToken(Token token) throws IOException { + if (!this.incrementToken()) return null; + token.copyBuffer(termAtt.buffer(), 0, termAtt.length()); + token.setPositionIncrement(posIncrAtt.getPositionIncrement()); + token.setFlags(flagsAtt.getFlags()); + token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset()); + token.setType(typeAtt.type()); + token.setPayload(payloadAtt.getPayload()); + return token; + } + + private static final Token request_next_token = new Token(); + + /** + * This method exists in order to avoid recursive calls to the method + * as the complexity of a fairly small matrix then easily would require + * a gigabyte sized stack per thread. + * + * @param reusableToken + * @return null if exhausted, instance request_next_token if one more call is required for an answer, or instance parameter resuableToken. + * @throws IOException + */ + private Token produceNextToken(final Token reusableToken) throws IOException { + + if (currentPermuationTokens != null) { + currentShingleLength++; + + if (currentShingleLength + currentPermutationTokensStartOffset <= currentPermuationTokens.size() + && currentShingleLength <= maximumShingleSize) { + + // it is possible to create at least one more shingle of the current matrix permutation + + if (ignoringSinglePrefixOrSuffixShingle + && currentShingleLength == 1 + && ((currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isFirst() || (currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isLast())) { + return getNextToken(reusableToken); + } + + int termLength = 0; + + List shingle = new ArrayList(currentShingleLength); + + for (int i = 0; i < currentShingleLength; i++) { + Token shingleToken = currentPermuationTokens.get(i + currentPermutationTokensStartOffset); + termLength += shingleToken.length(); + shingle.add(shingleToken); + } + if (spacerCharacter != null) { + termLength += currentShingleLength - 1; + } + + // only produce shingles that not already has been created + if (!shinglesSeen.add(shingle)) { + return request_next_token; + } + + // shingle token factory + StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future. + for (Token shingleToken : shingle) { + if (spacerCharacter != null && sb.length() > 0) { + sb.append(spacerCharacter); + } + sb.append(shingleToken.buffer(), 0, shingleToken.length()); + } + reusableToken.setEmpty().append(sb); + updateToken(reusableToken, shingle, currentPermutationTokensStartOffset, currentPermutationRows, currentPermuationTokens); + + return reusableToken; + + } else { + + // it is NOT possible to create one more shingles of the current matrix permutation + + if (currentPermutationTokensStartOffset < currentPermuationTokens.size() - 1) { + // reset shingle size and move one step to the right in the current tokens permutation + currentPermutationTokensStartOffset++; + currentShingleLength = minimumShingleSize - 1; + return request_next_token; + } + + + if (permutations == null) { + // todo does this ever occur? + return null; + } + + + if (!permutations.hasNext()) { + + // load more data (if available) to the matrix + + if (input != null && readColumn()) { + // don't really care, we just read it. + } + + // get rid of resources + + // delete the first column in the matrix + Matrix.Column deletedColumn = matrix.columns.remove(0); + + // remove all shingles seen that include any of the tokens from the deleted column. + List deletedColumnTokens = new ArrayList(); + for (Matrix.Column.Row row : deletedColumn.getRows()) { + for (Token token : row.getTokens()) { + deletedColumnTokens.add(token); + } + + } + for (Iterator> shinglesSeenIterator = shinglesSeen.iterator(); shinglesSeenIterator.hasNext();) { + List shingle = shinglesSeenIterator.next(); + for (Token deletedColumnToken : deletedColumnTokens) { + if (shingle.contains(deletedColumnToken)) { + shinglesSeenIterator.remove(); + break; + } + } + } + + + if (matrix.columns.size() < minimumShingleSize) { + // exhausted + return null; + } + + // create permutations of the matrix it now looks + permutations = matrix.permutationIterator(); + } + + nextTokensPermutation(); + return request_next_token; + + } + } + + if (permutations == null) { + permutations = matrix.permutationIterator(); + } + + if (!permutations.hasNext()) { + return null; + } + + nextTokensPermutation(); + + return request_next_token; + } + + /** + * get next permutation of row combinations, + * creates list of all tokens in the row and + * an index from each such token to what row they exist in. + * finally resets the current (next) shingle size and offset. + */ + private void nextTokensPermutation() { + Matrix.Column.Row[] rowsPermutation = permutations.next(); + List currentPermutationRows = new ArrayList(); + List currentPermuationTokens = new ArrayList(); + for (Matrix.Column.Row row : rowsPermutation) { + for (Token token : row.getTokens()) { + currentPermuationTokens.add(token); + currentPermutationRows.add(row); + } + } + this.currentPermuationTokens = currentPermuationTokens; + this.currentPermutationRows = currentPermutationRows; + + currentPermutationTokensStartOffset = 0; + currentShingleLength = minimumShingleSize - 1; + + } + + /** + * Final touch of a shingle token before it is passed on to the consumer from method {@link #incrementToken()}. + * + * Calculates and sets type, flags, position increment, start/end offsets and weight. + * + * @param token Shingle token + * @param shingle Tokens used to produce the shingle token. + * @param currentPermutationStartOffset Start offset in parameter currentPermutationTokens + * @param currentPermutationRows index to Matrix.Column.Row from the position of tokens in parameter currentPermutationTokens + * @param currentPermuationTokens tokens of the current permutation of rows in the matrix. + */ + public void updateToken(Token token, List shingle, int currentPermutationStartOffset, List currentPermutationRows, List currentPermuationTokens) { + token.setType(ShingleMatrixFilter.class.getName()); + token.setFlags(0); + token.setPositionIncrement(1); + token.setStartOffset(shingle.get(0).startOffset()); + token.setEndOffset(shingle.get(shingle.size() - 1).endOffset()); + settingsCodec.setWeight(token, calculateShingleWeight(token, shingle, currentPermutationStartOffset, currentPermutationRows, currentPermuationTokens)); + } + + /** + * Evaluates the new shingle token weight. + * + * for (shingle part token in shingle) + * weight += shingle part token weight * (1 / sqrt(all shingle part token weights summed)) + * + * This algorithm gives a slightly greater score for longer shingles + * and is rather penalising to great shingle token part weights. + * + * @param shingleToken token returned to consumer + * @param shingle tokens the tokens used to produce the shingle token. + * @param currentPermutationStartOffset start offset in parameter currentPermutationRows and currentPermutationTokens. + * @param currentPermutationRows an index to what matrix row a token in parameter currentPermutationTokens exist. + * @param currentPermuationTokens all tokens in the current row permutation of the matrix. A sub list (parameter offset, parameter shingle.size) equals parameter shingle. + * @return weight to be set for parameter shingleToken + */ + public float calculateShingleWeight(Token shingleToken, List shingle, int currentPermutationStartOffset, List currentPermutationRows, List currentPermuationTokens) { + double[] weights = new double[shingle.size()]; + + double total = 0f; + double top = 0d; + + + for (int i=0; i top) { + top = tmp; + } + total += tmp; + } + + double factor = 1d / Math.sqrt(total); + + double weight = 0d; + for (double partWeight : weights) { + weight += partWeight * factor; + } + + return (float) weight; + } + + + private Token readColumnBuf; + + /** + * Loads one column from the token stream. + * + * When the last token is read from the token stream it will column.setLast(true); + * + * @return true if it manage to read one more column from the input token stream + * @throws IOException if the matrix source input stream throws an exception + */ + private boolean readColumn() throws IOException { + + Token token; + if (readColumnBuf != null) { + token = readColumnBuf; + readColumnBuf = null; + } else { + token = getNextInputToken(new Token()); + } + + if (token == null) { + return false; + } + + Matrix.Column currentReaderColumn = matrix.new Column(); + Matrix.Column.Row currentReaderRow = currentReaderColumn.new Row(); + + currentReaderRow.getTokens().add(token); + TokenPositioner tokenPositioner; + while ((readColumnBuf = getNextInputToken(new Token())) != null + && (tokenPositioner = settingsCodec.getTokenPositioner(readColumnBuf)) != TokenPositioner.newColumn) { + + if (tokenPositioner == TokenPositioner.sameRow) { + currentReaderRow.getTokens().add(readColumnBuf); + } else /*if (tokenPositioner == TokenPositioner.newRow)*/ { + currentReaderRow = currentReaderColumn.new Row(); + currentReaderRow.getTokens().add(readColumnBuf); + } + readColumnBuf = null; + + } + + if (readColumnBuf == null) { + readColumnBuf = getNextInputToken(new Token()); + if (readColumnBuf == null) { + currentReaderColumn.setLast(true); + } + } + + + return true; + + } + + + /** + * A column focused matrix in three dimensions: + * + *

+   * Token[column][row][z-axis] {
+   *     {{hello}, {greetings, and, salutations}},
+   *     {{world}, {earth}, {tellus}}
+   * };
+   * 
+ * + * todo consider row groups + * to indicate that shingles is only to contain permutations with texts in that same row group. + * + */ + public static class Matrix { + + private boolean columnsHasBeenCreated = false; + + private List columns = new ArrayList(); + + public List getColumns() { + return columns; + } + + public class Column { + + private boolean last; + private boolean first; + + public Matrix getMatrix() { + return Matrix.this; + } + + public Column(Token token) { + this(); + Row row = new Row(); + row.getTokens().add(token); + } + + public Column() { + synchronized (Matrix.this) { + if (!columnsHasBeenCreated) { + this.setFirst(true); + columnsHasBeenCreated = true; + } + } + Matrix.this.columns.add(this); + } + + private List rows = new ArrayList(); + + public List getRows() { + return rows; + } + + + public int getIndex() { + return Matrix.this.columns.indexOf(this); + } + + @Override + public String toString() { + return "Column{" + + "first=" + first + + ", last=" + last + + ", rows=" + rows + + '}'; + } + + public boolean isFirst() { + return first; + } + + public void setFirst(boolean first) { + this.first = first; + } + + public void setLast(boolean last) { + this.last = last; + } + + public boolean isLast() { + return last; + } + + public class Row { + + public Column getColumn() { + return Column.this; + } + + private List tokens = new LinkedList(); + + public Row() { + Column.this.rows.add(this); + } + + public int getIndex() { + return Column.this.rows.indexOf(this); + } + + public List getTokens() { + return tokens; + } + + public void setTokens(List tokens) { + this.tokens = tokens; + } + +// public int getStartOffset() { +// int ret = tokens[0].startOffset(); +// if (getIndex() > 0 && ret == 0) { +// ret = Column.this.rows.get(0).getStartOffset(); +// } +// return ret; +// } +// +// public int getEndOffset() { +// int ret = tokens[tokens.length - 1].endOffset(); +// if (getIndex() > 0 && ret == 0) { +// ret = Column.this.rows.get(0).getEndOffset(); +// } +// return ret; +// } + + @Override + public String toString() { + return "Row{" + + "index=" + getIndex() + + ", tokens=" + (tokens == null ? null : tokens) + + '}'; + } + } + + } + + + public Iterator permutationIterator() { + + return new Iterator() { + + private int[] columnRowCounters = new int[columns.size()]; + + public void remove() { + throw new IllegalStateException("not implemented"); + } + + public boolean hasNext() { + int s = columnRowCounters.length; + int n = columns.size(); + return s != 0 && n >= s && columnRowCounters[s - 1] < (columns.get(s - 1)).getRows().size(); + } + + public Column.Row[] next() { + if (!hasNext()) { + throw new NoSuchElementException("no more elements"); + } + + Column.Row[] rows = new Column.Row[columnRowCounters.length]; + + for (int i = 0; i < columnRowCounters.length; i++) { + rows[i] = columns.get(i).rows.get(columnRowCounters[i]); + } + incrementColumnRowCounters(); + + return rows; + } + + private void incrementColumnRowCounters() { + for (int i = 0; i < columnRowCounters.length; i++) { + columnRowCounters[i]++; + if (columnRowCounters[i] == columns.get(i).rows.size() && + i < columnRowCounters.length - 1) { + columnRowCounters[i] = 0; + } else { + break; + } + } + } + }; + } + + @Override + public String toString() { + return "Matrix{" + + "columns=" + columns + + '}'; + } + } + + + public int getMinimumShingleSize() { + return minimumShingleSize; + } + + public void setMinimumShingleSize(int minimumShingleSize) { + this.minimumShingleSize = minimumShingleSize; + } + + public int getMaximumShingleSize() { + return maximumShingleSize; + } + + public void setMaximumShingleSize(int maximumShingleSize) { + this.maximumShingleSize = maximumShingleSize; + } + + + public Matrix getMatrix() { + return matrix; + } + + public void setMatrix(Matrix matrix) { + this.matrix = matrix; + } + + public Character getSpacerCharacter() { + return spacerCharacter; + } + + public void setSpacerCharacter(Character spacerCharacter) { + this.spacerCharacter = spacerCharacter; + } + + public boolean isIgnoringSinglePrefixOrSuffixShingle() { + return ignoringSinglePrefixOrSuffixShingle; + } + + public void setIgnoringSinglePrefixOrSuffixShingle(boolean ignoringSinglePrefixOrSuffixShingle) { + this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle; + } + + /** + * Using this codec makes a {@link ShingleMatrixFilter} act like {@link org.apache.lucene.analysis.shingle.ShingleFilter}. + * It produces the most simple sort of shingles, ignoring token position increments, et c. + * + * It adds each token as a new column. + */ + public static class OneDimensionalNonWeightedTokenSettingsCodec extends TokenSettingsCodec { + + @Override + public TokenPositioner getTokenPositioner(Token token) throws IOException { + return TokenPositioner.newColumn; + } + + @Override + public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) { + } + + @Override + public float getWeight(Token token) { + return 1f; + } + + @Override + public void setWeight(Token token, float weight) { + } + + } + + + /** + * A codec that creates a two dimensional matrix + * by treating tokens from the input stream with 0 position increment + * as new rows to the current column. + */ + public static class TwoDimensionalNonWeightedSynonymTokenSettingsCodec extends TokenSettingsCodec { + + @Override + public TokenPositioner getTokenPositioner(Token token) throws IOException { + if (token.getPositionIncrement() == 0) { + return TokenPositioner.newRow; + } else { + return TokenPositioner.newColumn; + } + } + + @Override + public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) { + throw new UnsupportedOperationException(); + } + + @Override + public float getWeight(Token token) { + return 1f; + } + + @Override + public void setWeight(Token token, float weight) { + } + + } + + /** + * A full featured codec not to be used for something serious. + * + * It takes complete control of + * payload for weight + * and the bit flags for positioning in the matrix. + * + * Mainly exist for demonstrational purposes. + */ + public static class SimpleThreeDimensionalTokenSettingsCodec extends TokenSettingsCodec { + + /** + * @param token + * @return the token flags int value as TokenPosition + * @throws IOException + */ + @Override + public TokenPositioner getTokenPositioner(Token token) throws IOException { + switch (token.getFlags()) { + case 0: + return TokenPositioner.newColumn; + case 1: + return TokenPositioner.newRow; + case 2: + return TokenPositioner.sameRow; + } + throw new IOException("Unknown matrix positioning of token " + token); + } + + /** + * Sets the TokenPositioner as token flags int value. + * + * @param token + * @param tokenPositioner + */ + @Override + public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) { + token.setFlags(tokenPositioner.getIndex()); + } + + /** + * Returns a 32 bit float from the payload, or 1f it null. + * + * @param token + * @return 32 bit float + */ + @Override + public float getWeight(Token token) { + if (token.getPayload() == null || token.getPayload().getData() == null) { + return 1f; + } else { + return PayloadHelper.decodeFloat(token.getPayload().getData()); + } + } + + /** + * Stores a 32 bit float in the payload, or set it to null if 1f; + * @param token + * @param weight + */ + @Override + public void setWeight(Token token, float weight) { + if (weight == 1f) { + token.setPayload(null); + } else { + token.setPayload(new Payload(PayloadHelper.encodeFloat(weight))); + } + } + + } + + +}