1 package org.apache.lucene.analysis.shingle;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.HashSet;
23 import java.util.Iterator;
24 import java.util.LinkedList;
25 import java.util.List;
26 import java.util.NoSuchElementException;
29 import org.apache.lucene.analysis.Token;
30 import org.apache.lucene.analysis.TokenStream;
31 import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
32 import org.apache.lucene.analysis.payloads.PayloadHelper;
33 import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column.Row;
34 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
35 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
36 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
37 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
38 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
39 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
40 import org.apache.lucene.index.Payload;
44 * <p>A ShingleMatrixFilter constructs shingles (token n-grams) from a token stream.
45 * In other words, it creates combinations of tokens as a single token.
47 * <p>For example, the sentence "please divide this sentence into shingles"
48 * might be tokenized into shingles "please divide", "divide this",
49 * "this sentence", "sentence into", and "into shingles".
51 * <p>Using a shingle filter at index and query time can in some instances
52 * be used to replace phrase queries, especially them with 0 slop.
54 * <p>Without a spacer character
55 * it can be used to handle composition and decomposition of words
56 * such as searching for "multi dimensional" instead of "multidimensional".
57 * It is a rather common human problem at query time
58 * in several languages, notably the northern Germanic branch.
60 * <p>Shingles are amongst many things also known to solve problems
61 * in spell checking, language detection and document clustering.
63 * <p>This filter is backed by a three dimensional column oriented matrix
64 * used to create permutations of the second dimension, the rows,
65 * and leaves the third, the z-axis, for for multi token synonyms.
67 * <p>In order to use this filter you need to define a way of positioning
68 * the input stream tokens in the matrix. This is done using a
69 * {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec}.
70 * There are three simple implementations for demonstrational purposes,
71 * see {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec},
72 * {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec}
73 * and {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec}.
75 * <p>Consider this token matrix:
77 * Token[column][row][z-axis]{
78 * {{hello}, {greetings, and, salutations}},
79 * {{world}, {earth}, {tellus}}
83 * It would produce the following 2-3 gram sized shingles:
88 * "greetings_and_salutations"
90 * "and_salutations_world"
93 * "and_salutations_earth"
96 * "and_salutations_tellus"
97 * "salutations_tellus"
100 * <p>This implementation can be rather heap demanding
101 * if (maximum shingle size - minimum shingle size) is a great number and the stream contains many columns,
102 * or if each column contains a great number of rows.
104 * <p>The problem is that in order avoid producing duplicates
105 * the filter needs to keep track of any shingle already produced and returned to the consumer.
107 * There is a bit of resource management to handle this
108 * but it would of course be much better if the filter was written
109 * so it never created the same shingle more than once in the first place.
111 * <p>The filter also has basic support for calculating weights for the shingles
112 * based on the weights of the tokens from the input stream, output shingle size, etc.
113 * See {@link #calculateShingleWeight(org.apache.lucene.analysis.Token, java.util.List, int, java.util.List, java.util.List)}.
115 * @deprecated Will be removed in Lucene 4.0. This filter is unmaintained and might not behave
116 * correctly if used with custom Attributes, i.e. Attributes other than
117 * the ones located in <code>org.apache.lucene.analysis.tokenattributes</code>. It also uses
118 * hardcoded payload encoders which makes it not easily adaptable to other use-cases.
121 public final class ShingleMatrixFilter extends TokenStream {
123 public static Character defaultSpacerCharacter = Character.valueOf('_');
124 public static TokenSettingsCodec defaultSettingsCodec = new OneDimensionalNonWeightedTokenSettingsCodec();
125 public static boolean ignoringSinglePrefixOrSuffixShingleByDefault = false;
128 * Strategy used to code and decode meta data of the tokens from the input stream
129 * regarding how to position the tokens in the matrix, set and retreive weight, et c.
131 public static abstract class TokenSettingsCodec {
134 * Retrieves information on how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
136 * @return {@link ShingleMatrixFilter.TokenPositioner}
137 * @throws IOException
139 public abstract TokenPositioner getTokenPositioner(Token token) throws IOException;
142 * Sets information on how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
145 * @param tokenPositioner
147 public abstract void setTokenPositioner(Token token, ShingleMatrixFilter.TokenPositioner tokenPositioner);
150 * Have this method return 1f in order to 'disable' weights.
152 * @return the weight of parameter token
154 public abstract float getWeight(Token token);
157 * Have this method do nothing in order to 'disable' weights.
161 public abstract void setWeight(Token token, float weight);
166 * Used to describe how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
167 * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#getTokenPositioner(org.apache.lucene.analysis.Token)
168 * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#setTokenPositioner(org.apache.lucene.analysis.Token,org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenPositioner)
170 public static class TokenPositioner {
171 public static final TokenPositioner newColumn = new TokenPositioner(0);
172 public static final TokenPositioner newRow = new TokenPositioner(1);
173 public static final TokenPositioner sameRow = new TokenPositioner(2);
175 private final int index;
177 private TokenPositioner(int index) {
181 public int getIndex() {
186 // filter instance settings variables
188 private TokenSettingsCodec settingsCodec;
190 private int minimumShingleSize;
191 private int maximumShingleSize;
193 private boolean ignoringSinglePrefixOrSuffixShingle = false;
195 private Character spacerCharacter = defaultSpacerCharacter;
197 private TokenStream input;
199 private CharTermAttribute termAtt;
200 private PositionIncrementAttribute posIncrAtt;
201 private PayloadAttribute payloadAtt;
202 private OffsetAttribute offsetAtt;
203 private TypeAttribute typeAtt;
204 private FlagsAttribute flagsAtt;
206 private CharTermAttribute in_termAtt;
207 private PositionIncrementAttribute in_posIncrAtt;
208 private PayloadAttribute in_payloadAtt;
209 private OffsetAttribute in_offsetAtt;
210 private TypeAttribute in_typeAtt;
211 private FlagsAttribute in_flagsAtt;
215 * Creates a shingle filter based on a user defined matrix.
217 * The filter /will/ delete columns from the input matrix! You will not be able to reset the filter if you used this constructor.
218 * todo: don't touch the matrix! use a boolean, set the input stream to null or something, and keep track of where in the matrix we are at.
220 * @param matrix the input based for creating shingles. Does not need to contain any information until {@link #incrementToken()} is called the first time.
221 * @param minimumShingleSize minimum number of tokens in any shingle.
222 * @param maximumShingleSize maximum number of tokens in any shingle.
223 * @param spacerCharacter character to use between texts of the token parts in a shingle. null for none.
224 * @param ignoringSinglePrefixOrSuffixShingle if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.
225 * @param settingsCodec codec used to read input token weight and matrix positioning.
227 public ShingleMatrixFilter(Matrix matrix, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec) {
228 this.matrix = matrix;
229 this.minimumShingleSize = minimumShingleSize;
230 this.maximumShingleSize = maximumShingleSize;
231 this.spacerCharacter = spacerCharacter;
232 this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
233 this.settingsCodec = settingsCodec;
235 termAtt = addAttribute(CharTermAttribute.class);
236 posIncrAtt = addAttribute(PositionIncrementAttribute.class);
237 payloadAtt = addAttribute(PayloadAttribute.class);
238 offsetAtt = addAttribute(OffsetAttribute.class);
239 typeAtt = addAttribute(TypeAttribute.class);
240 flagsAtt = addAttribute(FlagsAttribute.class);
242 // set the input to be an empty token stream, we already have the data.
243 this.input = new EmptyTokenStream();
245 in_termAtt = input.addAttribute(CharTermAttribute.class);
246 in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class);
247 in_payloadAtt = input.addAttribute(PayloadAttribute.class);
248 in_offsetAtt = input.addAttribute(OffsetAttribute.class);
249 in_typeAtt = input.addAttribute(TypeAttribute.class);
250 in_flagsAtt = input.addAttribute(FlagsAttribute.class);
254 * Creates a shingle filter using default settings.
256 * @see #defaultSpacerCharacter
257 * @see #ignoringSinglePrefixOrSuffixShingleByDefault
258 * @see #defaultSettingsCodec
260 * @param input stream from which to construct the matrix
261 * @param minimumShingleSize minimum number of tokens in any shingle.
262 * @param maximumShingleSize maximum number of tokens in any shingle.
264 public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize) {
265 this(input, minimumShingleSize, maximumShingleSize, defaultSpacerCharacter);
270 * Creates a shingle filter using default settings.
272 * @see #ignoringSinglePrefixOrSuffixShingleByDefault
273 * @see #defaultSettingsCodec
275 * @param input stream from which to construct the matrix
276 * @param minimumShingleSize minimum number of tokens in any shingle.
277 * @param maximumShingleSize maximum number of tokens in any shingle.
278 * @param spacerCharacter character to use between texts of the token parts in a shingle. null for none.
280 public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter) {
281 this(input, minimumShingleSize, maximumShingleSize, spacerCharacter, ignoringSinglePrefixOrSuffixShingleByDefault);
285 * Creates a shingle filter using the default {@link TokenSettingsCodec}.
287 * @see #defaultSettingsCodec
289 * @param input stream from which to construct the matrix
290 * @param minimumShingleSize minimum number of tokens in any shingle.
291 * @param maximumShingleSize maximum number of tokens in any shingle.
292 * @param spacerCharacter character to use between texts of the token parts in a shingle. null for none.
293 * @param ignoringSinglePrefixOrSuffixShingle if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.
295 public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle) {
296 this(input, minimumShingleSize, maximumShingleSize, spacerCharacter, ignoringSinglePrefixOrSuffixShingle, defaultSettingsCodec);
301 * Creates a shingle filter with ad hoc parameter settings.
303 * @param input stream from which to construct the matrix
304 * @param minimumShingleSize minimum number of tokens in any shingle.
305 * @param maximumShingleSize maximum number of tokens in any shingle.
306 * @param spacerCharacter character to use between texts of the token parts in a shingle. null for none.
307 * @param ignoringSinglePrefixOrSuffixShingle if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.
308 * @param settingsCodec codec used to read input token weight and matrix positioning.
310 public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec) {
312 this.minimumShingleSize = minimumShingleSize;
313 this.maximumShingleSize = maximumShingleSize;
314 this.spacerCharacter = spacerCharacter;
315 this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
316 this.settingsCodec = settingsCodec;
317 termAtt = addAttribute(CharTermAttribute.class);
318 posIncrAtt = addAttribute(PositionIncrementAttribute.class);
319 payloadAtt = addAttribute(PayloadAttribute.class);
320 offsetAtt = addAttribute(OffsetAttribute.class);
321 typeAtt = addAttribute(TypeAttribute.class);
322 flagsAtt = addAttribute(FlagsAttribute.class);
324 in_termAtt = input.addAttribute(CharTermAttribute.class);
325 in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class);
326 in_payloadAtt = input.addAttribute(PayloadAttribute.class);
327 in_offsetAtt = input.addAttribute(OffsetAttribute.class);
328 in_typeAtt = input.addAttribute(TypeAttribute.class);
329 in_flagsAtt = input.addAttribute(FlagsAttribute.class);
332 // internal filter instance variables
334 /** iterator over the current matrix row permutations */
335 private Iterator<Matrix.Column.Row[]> permutations;
337 /** the current permutation of tokens used to produce shingles */
338 private List<Token> currentPermuationTokens;
339 /** index to what row a token in currentShingleTokens represents*/
340 private List<Matrix.Column.Row> currentPermutationRows;
342 private int currentPermutationTokensStartOffset;
343 private int currentShingleLength;
346 * a set containing shingles that has been the result of a call to {@link #incrementToken()},
347 * used to avoid producing the same shingle more than once.
349 private Set<List<Token>> shinglesSeen = new HashSet<List<Token>>();
353 public void reset() throws IOException {
355 shinglesSeen.clear();
359 private Matrix matrix;
361 private Token reusableToken = new Token();
364 public final boolean incrementToken() throws IOException {
365 if (matrix == null) {
366 matrix = new Matrix();
367 // fill matrix with maximumShingleSize columns
368 while (matrix.columns.size() < maximumShingleSize && readColumn()) {
369 // this loop looks ugly
373 // this loop exists in order to avoid recursive calls to the next method
374 // as the complexity of a large matrix
375 // then would require a multi gigabyte sized stack.
378 token = produceNextToken(reusableToken);
379 } while (token == request_next_token);
380 if (token == null) return false;
383 termAtt.copyBuffer(token.buffer(), 0, token.length());
384 posIncrAtt.setPositionIncrement(token.getPositionIncrement());
385 flagsAtt.setFlags(token.getFlags());
386 offsetAtt.setOffset(token.startOffset(), token.endOffset());
387 typeAtt.setType(token.type());
388 payloadAtt.setPayload(token.getPayload());
392 private Token getNextInputToken(Token token) throws IOException {
393 if (!input.incrementToken()) return null;
394 token.copyBuffer(in_termAtt.buffer(), 0, in_termAtt.length());
395 token.setPositionIncrement(in_posIncrAtt.getPositionIncrement());
396 token.setFlags(in_flagsAtt.getFlags());
397 token.setOffset(in_offsetAtt.startOffset(), in_offsetAtt.endOffset());
398 token.setType(in_typeAtt.type());
399 token.setPayload(in_payloadAtt.getPayload());
403 private Token getNextToken(Token token) throws IOException {
404 if (!this.incrementToken()) return null;
405 token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
406 token.setPositionIncrement(posIncrAtt.getPositionIncrement());
407 token.setFlags(flagsAtt.getFlags());
408 token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
409 token.setType(typeAtt.type());
410 token.setPayload(payloadAtt.getPayload());
414 private static final Token request_next_token = new Token();
417 * This method exists in order to avoid recursive calls to the method
418 * as the complexity of a fairly small matrix then easily would require
419 * a gigabyte sized stack per thread.
421 * @param reusableToken
422 * @return null if exhausted, instance request_next_token if one more call is required for an answer, or instance parameter resuableToken.
423 * @throws IOException
425 private Token produceNextToken(final Token reusableToken) throws IOException {
427 if (currentPermuationTokens != null) {
428 currentShingleLength++;
430 if (currentShingleLength + currentPermutationTokensStartOffset <= currentPermuationTokens.size()
431 && currentShingleLength <= maximumShingleSize) {
433 // it is possible to create at least one more shingle of the current matrix permutation
435 if (ignoringSinglePrefixOrSuffixShingle
436 && currentShingleLength == 1
437 && ((currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isFirst() || (currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isLast())) {
438 return getNextToken(reusableToken);
443 List<Token> shingle = new ArrayList<Token>(currentShingleLength);
445 for (int i = 0; i < currentShingleLength; i++) {
446 Token shingleToken = currentPermuationTokens.get(i + currentPermutationTokensStartOffset);
447 termLength += shingleToken.length();
448 shingle.add(shingleToken);
450 if (spacerCharacter != null) {
451 termLength += currentShingleLength - 1;
454 // only produce shingles that not already has been created
455 if (!shinglesSeen.add(shingle)) {
456 return request_next_token;
459 // shingle token factory
460 StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future.
461 for (Token shingleToken : shingle) {
462 if (spacerCharacter != null && sb.length() > 0) {
463 sb.append(spacerCharacter);
465 sb.append(shingleToken.buffer(), 0, shingleToken.length());
467 reusableToken.setEmpty().append(sb);
468 updateToken(reusableToken, shingle, currentPermutationTokensStartOffset, currentPermutationRows, currentPermuationTokens);
470 return reusableToken;
474 // it is NOT possible to create one more shingles of the current matrix permutation
476 if (currentPermutationTokensStartOffset < currentPermuationTokens.size() - 1) {
477 // reset shingle size and move one step to the right in the current tokens permutation
478 currentPermutationTokensStartOffset++;
479 currentShingleLength = minimumShingleSize - 1;
480 return request_next_token;
484 if (permutations == null) {
485 // todo does this ever occur?
490 if (!permutations.hasNext()) {
492 // load more data (if available) to the matrix
494 if (input != null && readColumn()) {
495 // don't really care, we just read it.
498 // get rid of resources
500 // delete the first column in the matrix
501 Matrix.Column deletedColumn = matrix.columns.remove(0);
503 // remove all shingles seen that include any of the tokens from the deleted column.
504 List<Token> deletedColumnTokens = new ArrayList<Token>();
505 for (Matrix.Column.Row row : deletedColumn.getRows()) {
506 for (Token token : row.getTokens()) {
507 deletedColumnTokens.add(token);
511 for (Iterator<List<Token>> shinglesSeenIterator = shinglesSeen.iterator(); shinglesSeenIterator.hasNext();) {
512 List<Token> shingle = shinglesSeenIterator.next();
513 for (Token deletedColumnToken : deletedColumnTokens) {
514 if (shingle.contains(deletedColumnToken)) {
515 shinglesSeenIterator.remove();
522 if (matrix.columns.size() < minimumShingleSize) {
527 // create permutations of the matrix it now looks
528 permutations = matrix.permutationIterator();
531 nextTokensPermutation();
532 return request_next_token;
537 if (permutations == null) {
538 permutations = matrix.permutationIterator();
541 if (!permutations.hasNext()) {
545 nextTokensPermutation();
547 return request_next_token;
551 * get next permutation of row combinations,
552 * creates list of all tokens in the row and
553 * an index from each such token to what row they exist in.
554 * finally resets the current (next) shingle size and offset.
556 private void nextTokensPermutation() {
557 Matrix.Column.Row[] rowsPermutation = permutations.next();
558 List<Matrix.Column.Row> currentPermutationRows = new ArrayList<Matrix.Column.Row>();
559 List<Token> currentPermuationTokens = new ArrayList<Token>();
560 for (Matrix.Column.Row row : rowsPermutation) {
561 for (Token token : row.getTokens()) {
562 currentPermuationTokens.add(token);
563 currentPermutationRows.add(row);
566 this.currentPermuationTokens = currentPermuationTokens;
567 this.currentPermutationRows = currentPermutationRows;
569 currentPermutationTokensStartOffset = 0;
570 currentShingleLength = minimumShingleSize - 1;
575 * Final touch of a shingle token before it is passed on to the consumer from method {@link #incrementToken()}.
577 * Calculates and sets type, flags, position increment, start/end offsets and weight.
579 * @param token Shingle token
580 * @param shingle Tokens used to produce the shingle token.
581 * @param currentPermutationStartOffset Start offset in parameter currentPermutationTokens
582 * @param currentPermutationRows index to Matrix.Column.Row from the position of tokens in parameter currentPermutationTokens
583 * @param currentPermuationTokens tokens of the current permutation of rows in the matrix.
585 public void updateToken(Token token, List<Token> shingle, int currentPermutationStartOffset, List<Row> currentPermutationRows, List<Token> currentPermuationTokens) {
586 token.setType(ShingleMatrixFilter.class.getName());
588 token.setPositionIncrement(1);
589 token.setStartOffset(shingle.get(0).startOffset());
590 token.setEndOffset(shingle.get(shingle.size() - 1).endOffset());
591 settingsCodec.setWeight(token, calculateShingleWeight(token, shingle, currentPermutationStartOffset, currentPermutationRows, currentPermuationTokens));
595 * Evaluates the new shingle token weight.
597 * for (shingle part token in shingle)
598 * weight += shingle part token weight * (1 / sqrt(all shingle part token weights summed))
600 * This algorithm gives a slightly greater score for longer shingles
601 * and is rather penalising to great shingle token part weights.
603 * @param shingleToken token returned to consumer
604 * @param shingle tokens the tokens used to produce the shingle token.
605 * @param currentPermutationStartOffset start offset in parameter currentPermutationRows and currentPermutationTokens.
606 * @param currentPermutationRows an index to what matrix row a token in parameter currentPermutationTokens exist.
607 * @param currentPermuationTokens all tokens in the current row permutation of the matrix. A sub list (parameter offset, parameter shingle.size) equals parameter shingle.
608 * @return weight to be set for parameter shingleToken
610 public float calculateShingleWeight(Token shingleToken, List<Token> shingle, int currentPermutationStartOffset, List<Row> currentPermutationRows, List<Token> currentPermuationTokens) {
611 double[] weights = new double[shingle.size()];
617 for (int i=0; i<weights.length; i++) {
618 weights[i] = settingsCodec.getWeight(shingle.get(i));
620 double tmp = weights[i];
627 double factor = 1d / Math.sqrt(total);
630 for (double partWeight : weights) {
631 weight += partWeight * factor;
634 return (float) weight;
638 private Token readColumnBuf;
641 * Loads one column from the token stream.
643 * When the last token is read from the token stream it will column.setLast(true);
645 * @return true if it manage to read one more column from the input token stream
646 * @throws IOException if the matrix source input stream throws an exception
648 private boolean readColumn() throws IOException {
651 if (readColumnBuf != null) {
652 token = readColumnBuf;
653 readColumnBuf = null;
655 token = getNextInputToken(new Token());
662 Matrix.Column currentReaderColumn = matrix.new Column();
663 Matrix.Column.Row currentReaderRow = currentReaderColumn.new Row();
665 currentReaderRow.getTokens().add(token);
666 TokenPositioner tokenPositioner;
667 while ((readColumnBuf = getNextInputToken(new Token())) != null
668 && (tokenPositioner = settingsCodec.getTokenPositioner(readColumnBuf)) != TokenPositioner.newColumn) {
670 if (tokenPositioner == TokenPositioner.sameRow) {
671 currentReaderRow.getTokens().add(readColumnBuf);
672 } else /*if (tokenPositioner == TokenPositioner.newRow)*/ {
673 currentReaderRow = currentReaderColumn.new Row();
674 currentReaderRow.getTokens().add(readColumnBuf);
676 readColumnBuf = null;
680 if (readColumnBuf == null) {
681 readColumnBuf = getNextInputToken(new Token());
682 if (readColumnBuf == null) {
683 currentReaderColumn.setLast(true);
694 * A column focused matrix in three dimensions:
697 * Token[column][row][z-axis] {
698 * {{hello}, {greetings, and, salutations}},
699 * {{world}, {earth}, {tellus}}
703 * todo consider row groups
704 * to indicate that shingles is only to contain permutations with texts in that same row group.
707 public static class Matrix {
709 private boolean columnsHasBeenCreated = false;
711 private List<Column> columns = new ArrayList<Column>();
713 public List<Column> getColumns() {
717 public class Column {
719 private boolean last;
720 private boolean first;
722 public Matrix getMatrix() {
726 public Column(Token token) {
729 row.getTokens().add(token);
733 synchronized (Matrix.this) {
734 if (!columnsHasBeenCreated) {
736 columnsHasBeenCreated = true;
739 Matrix.this.columns.add(this);
742 private List<Row> rows = new ArrayList<Row>();
744 public List<Row> getRows() {
749 public int getIndex() {
750 return Matrix.this.columns.indexOf(this);
754 public String toString() {
762 public boolean isFirst() {
766 public void setFirst(boolean first) {
770 public void setLast(boolean last) {
774 public boolean isLast() {
780 public Column getColumn() {
784 private List<Token> tokens = new LinkedList<Token>();
787 Column.this.rows.add(this);
790 public int getIndex() {
791 return Column.this.rows.indexOf(this);
794 public List<Token> getTokens() {
798 public void setTokens(List<Token> tokens) {
799 this.tokens = tokens;
802 // public int getStartOffset() {
803 // int ret = tokens[0].startOffset();
804 // if (getIndex() > 0 && ret == 0) {
805 // ret = Column.this.rows.get(0).getStartOffset();
810 // public int getEndOffset() {
811 // int ret = tokens[tokens.length - 1].endOffset();
812 // if (getIndex() > 0 && ret == 0) {
813 // ret = Column.this.rows.get(0).getEndOffset();
819 public String toString() {
821 "index=" + getIndex() +
822 ", tokens=" + (tokens == null ? null : tokens) +
830 public Iterator<Column.Row[]> permutationIterator() {
832 return new Iterator<Column.Row[]>() {
834 private int[] columnRowCounters = new int[columns.size()];
836 public void remove() {
837 throw new IllegalStateException("not implemented");
840 public boolean hasNext() {
841 int s = columnRowCounters.length;
842 int n = columns.size();
843 return s != 0 && n >= s && columnRowCounters[s - 1] < (columns.get(s - 1)).getRows().size();
846 public Column.Row[] next() {
848 throw new NoSuchElementException("no more elements");
851 Column.Row[] rows = new Column.Row[columnRowCounters.length];
853 for (int i = 0; i < columnRowCounters.length; i++) {
854 rows[i] = columns.get(i).rows.get(columnRowCounters[i]);
856 incrementColumnRowCounters();
861 private void incrementColumnRowCounters() {
862 for (int i = 0; i < columnRowCounters.length; i++) {
863 columnRowCounters[i]++;
864 if (columnRowCounters[i] == columns.get(i).rows.size() &&
865 i < columnRowCounters.length - 1) {
866 columnRowCounters[i] = 0;
876 public String toString() {
878 "columns=" + columns +
884 public int getMinimumShingleSize() {
885 return minimumShingleSize;
888 public void setMinimumShingleSize(int minimumShingleSize) {
889 this.minimumShingleSize = minimumShingleSize;
892 public int getMaximumShingleSize() {
893 return maximumShingleSize;
896 public void setMaximumShingleSize(int maximumShingleSize) {
897 this.maximumShingleSize = maximumShingleSize;
901 public Matrix getMatrix() {
905 public void setMatrix(Matrix matrix) {
906 this.matrix = matrix;
909 public Character getSpacerCharacter() {
910 return spacerCharacter;
913 public void setSpacerCharacter(Character spacerCharacter) {
914 this.spacerCharacter = spacerCharacter;
917 public boolean isIgnoringSinglePrefixOrSuffixShingle() {
918 return ignoringSinglePrefixOrSuffixShingle;
921 public void setIgnoringSinglePrefixOrSuffixShingle(boolean ignoringSinglePrefixOrSuffixShingle) {
922 this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
926 * Using this codec makes a {@link ShingleMatrixFilter} act like {@link org.apache.lucene.analysis.shingle.ShingleFilter}.
927 * It produces the most simple sort of shingles, ignoring token position increments, et c.
929 * It adds each token as a new column.
931 public static class OneDimensionalNonWeightedTokenSettingsCodec extends TokenSettingsCodec {
934 public TokenPositioner getTokenPositioner(Token token) throws IOException {
935 return TokenPositioner.newColumn;
939 public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) {
943 public float getWeight(Token token) {
948 public void setWeight(Token token, float weight) {
955 * A codec that creates a two dimensional matrix
956 * by treating tokens from the input stream with 0 position increment
957 * as new rows to the current column.
959 public static class TwoDimensionalNonWeightedSynonymTokenSettingsCodec extends TokenSettingsCodec {
962 public TokenPositioner getTokenPositioner(Token token) throws IOException {
963 if (token.getPositionIncrement() == 0) {
964 return TokenPositioner.newRow;
966 return TokenPositioner.newColumn;
971 public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) {
972 throw new UnsupportedOperationException();
976 public float getWeight(Token token) {
981 public void setWeight(Token token, float weight) {
987 * A full featured codec not to be used for something serious.
989 * It takes complete control of
991 * and the bit flags for positioning in the matrix.
993 * Mainly exist for demonstrational purposes.
995 public static class SimpleThreeDimensionalTokenSettingsCodec extends TokenSettingsCodec {
999 * @return the token flags int value as TokenPosition
1000 * @throws IOException
1003 public TokenPositioner getTokenPositioner(Token token) throws IOException {
1004 switch (token.getFlags()) {
1006 return TokenPositioner.newColumn;
1008 return TokenPositioner.newRow;
1010 return TokenPositioner.sameRow;
1012 throw new IOException("Unknown matrix positioning of token " + token);
1016 * Sets the TokenPositioner as token flags int value.
1019 * @param tokenPositioner
1022 public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) {
1023 token.setFlags(tokenPositioner.getIndex());
1027 * Returns a 32 bit float from the payload, or 1f it null.
1030 * @return 32 bit float
1033 public float getWeight(Token token) {
1034 if (token.getPayload() == null || token.getPayload().getData() == null) {
1037 return PayloadHelper.decodeFloat(token.getPayload().getData());
1042 * Stores a 32 bit float in the payload, or set it to null if 1f;
1047 public void setWeight(Token token, float weight) {
1049 token.setPayload(null);
1051 token.setPayload(new Payload(PayloadHelper.encodeFloat(weight)));