+++ /dev/null
-package org.apache.lucene.analysis;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.analysis.tokenattributes.TermAttributeImpl;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.index.Payload;
-import org.apache.lucene.index.TermPositions; // for javadoc
-import org.apache.lucene.util.Attribute;
-import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.AttributeImpl;
-import org.apache.lucene.util.AttributeReflector;
-
-/**
- A Token is an occurrence of a term from the text of a field. It consists of
- a term's text, the start and end offset of the term in the text of the field,
- and a type string.
- <p>
- The start and end offsets permit applications to re-associate a token with
- its source text, e.g., to display highlighted query terms in a document
- browser, or to show matching text fragments in a <abbr title="KeyWord In Context">KWIC</abbr>
- display, etc.
- <p>
- The type is a string, assigned by a lexical analyzer
- (a.k.a. tokenizer), naming the lexical or syntactic class that the token
- belongs to. For example an end of sentence marker token might be implemented
- with type "eos". The default token type is "word".
- <p>
- A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
- length byte array. Use {@link TermPositions#getPayloadLength()} and
- {@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.
-
- <br><br>
-
- <p><b>NOTE:</b> As of 2.9, Token implements all {@link Attribute} interfaces
- that are part of core Lucene and can be found in the {@code tokenattributes} subpackage.
- Even though it is not necessary to use Token anymore, with the new TokenStream API it can
- be used as convenience class that implements all {@link Attribute}s, which is especially useful
- to easily switch from the old to the new TokenStream API.
-
- <br><br>
-
- <p>Tokenizers and TokenFilters should try to re-use a Token
- instance when possible for best performance, by
- implementing the {@link TokenStream#incrementToken()} API.
- Failing that, to create a new Token you should first use
- one of the constructors that starts with null text. To load
- the token from a char[] use {@link #copyBuffer(char[], int, int)}.
- To load from a String use {@link #setEmpty} followed by {@link #append(CharSequence)} or {@link #append(CharSequence, int, int)}.
- Alternatively you can get the Token's termBuffer by calling either {@link #buffer()},
- if you know that your text is shorter than the capacity of the termBuffer
- or {@link #resizeBuffer(int)}, if there is any possibility
- that you may need to grow the buffer. Fill in the characters of your term into this
- buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string,
- or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setLength(int)} to
- set the length of the term text. See <a target="_top"
- href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
- for details.</p>
- <p>Typical Token reuse patterns:
- <ul>
- <li> Copying text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
- <pre>
- return reusableToken.reinit(string, startOffset, endOffset[, type]);
- </pre>
- </li>
- <li> Copying some text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
- <pre>
- return reusableToken.reinit(string, 0, string.length(), startOffset, endOffset[, type]);
- </pre>
- </li>
- </li>
- <li> Copying text from char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
- <pre>
- return reusableToken.reinit(buffer, 0, buffer.length, startOffset, endOffset[, type]);
- </pre>
- </li>
- <li> Copying some text from a char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
- <pre>
- return reusableToken.reinit(buffer, start, end - start, startOffset, endOffset[, type]);
- </pre>
- </li>
- <li> Copying from one one Token to another (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
- <pre>
- return reusableToken.reinit(source.buffer(), 0, source.length(), source.startOffset(), source.endOffset()[, source.type()]);
- </pre>
- </li>
- </ul>
- A few things to note:
- <ul>
- <li>clear() initializes all of the fields to default values. This was changed in contrast to Lucene 2.4, but should affect no one.</li>
- <li>Because <code>TokenStreams</code> can be chained, one cannot assume that the <code>Token's</code> current type is correct.</li>
- <li>The startOffset and endOffset represent the start and offset in the source text, so be careful in adjusting them.</li>
- <li>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.</li>
- </ul>
- </p>
- <p>
- <b>Please note:</b> With Lucene 3.1, the <code>{@linkplain #toString toString()}</code> method had to be changed to match the
- {@link CharSequence} interface introduced by the interface {@link org.apache.lucene.analysis.tokenattributes.CharTermAttribute}.
- This method now only prints the term text, no additional information anymore.
- </p>
- @see org.apache.lucene.index.Payload
-*/
-// TODO: change superclass to CharTermAttribute in 4.0! Maybe deprecate the whole class?
-public class Token extends TermAttributeImpl
- implements TypeAttribute, PositionIncrementAttribute,
- FlagsAttribute, OffsetAttribute, PayloadAttribute {
-
- private int startOffset,endOffset;
- private String type = DEFAULT_TYPE;
- private int flags;
- private Payload payload;
- private int positionIncrement = 1;
-
- /** Constructs a Token will null text. */
- public Token() {
- }
-
- /** Constructs a Token with null text and start & end
- * offsets.
- * @param start start offset in the source text
- * @param end end offset in the source text */
- public Token(int start, int end) {
- startOffset = start;
- endOffset = end;
- }
-
- /** Constructs a Token with null text and start & end
- * offsets plus the Token type.
- * @param start start offset in the source text
- * @param end end offset in the source text
- * @param typ the lexical type of this Token */
- public Token(int start, int end, String typ) {
- startOffset = start;
- endOffset = end;
- type = typ;
- }
-
- /**
- * Constructs a Token with null text and start & end
- * offsets plus flags. NOTE: flags is EXPERIMENTAL.
- * @param start start offset in the source text
- * @param end end offset in the source text
- * @param flags The bits to set for this token
- */
- public Token(int start, int end, int flags) {
- startOffset = start;
- endOffset = end;
- this.flags = flags;
- }
-
- /** Constructs a Token with the given term text, and start
- * & end offsets. The type defaults to "word."
- * <b>NOTE:</b> for better indexing speed you should
- * instead use the char[] termBuffer methods to set the
- * term text.
- * @param text term text
- * @param start start offset
- * @param end end offset
- */
- public Token(String text, int start, int end) {
- append(text);
- startOffset = start;
- endOffset = end;
- }
-
- /** Constructs a Token with the given text, start and end
- * offsets, & type. <b>NOTE:</b> for better indexing
- * speed you should instead use the char[] termBuffer
- * methods to set the term text.
- * @param text term text
- * @param start start offset
- * @param end end offset
- * @param typ token type
- */
- public Token(String text, int start, int end, String typ) {
- append(text);
- startOffset = start;
- endOffset = end;
- type = typ;
- }
-
- /**
- * Constructs a Token with the given text, start and end
- * offsets, & type. <b>NOTE:</b> for better indexing
- * speed you should instead use the char[] termBuffer
- * methods to set the term text.
- * @param text
- * @param start
- * @param end
- * @param flags token type bits
- */
- public Token(String text, int start, int end, int flags) {
- append(text);
- startOffset = start;
- endOffset = end;
- this.flags = flags;
- }
-
- /**
- * Constructs a Token with the given term buffer (offset
- * & length), start and end
- * offsets
- * @param startTermBuffer
- * @param termBufferOffset
- * @param termBufferLength
- * @param start
- * @param end
- */
- public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) {
- copyBuffer(startTermBuffer, termBufferOffset, termBufferLength);
- startOffset = start;
- endOffset = end;
- }
-
- /** Set the position increment. This determines the position of this token
- * relative to the previous Token in a {@link TokenStream}, used in phrase
- * searching.
- *
- * <p>The default value is one.
- *
- * <p>Some common uses for this are:<ul>
- *
- * <li>Set it to zero to put multiple terms in the same position. This is
- * useful if, e.g., a word has multiple stems. Searches for phrases
- * including either stem will match. In this case, all but the first stem's
- * increment should be set to zero: the increment of the first instance
- * should be one. Repeating a token with an increment of zero can also be
- * used to boost the scores of matches on that token.
- *
- * <li>Set it to values greater than one to inhibit exact phrase matches.
- * If, for example, one does not want phrases to match across removed stop
- * words, then one could build a stop word filter that removes stop words and
- * also sets the increment to the number of stop words removed before each
- * non-stop word. Then exact phrase queries will only match when the terms
- * occur with no intervening stop words.
- *
- * </ul>
- * @param positionIncrement the distance from the prior term
- * @see org.apache.lucene.index.TermPositions
- */
- public void setPositionIncrement(int positionIncrement) {
- if (positionIncrement < 0)
- throw new IllegalArgumentException
- ("Increment must be zero or greater: " + positionIncrement);
- this.positionIncrement = positionIncrement;
- }
-
- /** Returns the position increment of this Token.
- * @see #setPositionIncrement
- */
- public int getPositionIncrement() {
- return positionIncrement;
- }
-
- /** Returns this Token's starting offset, the position of the first character
- corresponding to this token in the source text.
-
- Note that the difference between endOffset() and startOffset() may not be
- equal to {@link #length}, as the term text may have been altered by a
- stemmer or some other filter. */
- public final int startOffset() {
- return startOffset;
- }
-
- /** Set the starting offset.
- @see #startOffset() */
- public void setStartOffset(int offset) {
- this.startOffset = offset;
- }
-
- /** Returns this Token's ending offset, one greater than the position of the
- last character corresponding to this token in the source text. The length
- of the token in the source text is (endOffset - startOffset). */
- public final int endOffset() {
- return endOffset;
- }
-
- /** Set the ending offset.
- @see #endOffset() */
- public void setEndOffset(int offset) {
- this.endOffset = offset;
- }
-
- /** Set the starting and ending offset.
- @see #startOffset() and #endOffset()*/
- public void setOffset(int startOffset, int endOffset) {
- this.startOffset = startOffset;
- this.endOffset = endOffset;
- }
-
- /** Returns this Token's lexical type. Defaults to "word". */
- public final String type() {
- return type;
- }
-
- /** Set the lexical type.
- @see #type() */
- public final void setType(String type) {
- this.type = type;
- }
-
- /**
- * <p/>
- *
- * Get the bitset for any bits that have been set. This is completely distinct from {@link #type()}, although they do share similar purposes.
- * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
- *
- *
- * @return The bits
- * @lucene.experimental While we think this is here to stay, we may want to change it to be a long.
- */
- public int getFlags() {
- return flags;
- }
-
- /**
- * @see #getFlags()
- */
- public void setFlags(int flags) {
- this.flags = flags;
- }
-
- /**
- * Returns this Token's payload.
- */
- public Payload getPayload() {
- return this.payload;
- }
-
- /**
- * Sets this Token's payload.
- */
- public void setPayload(Payload payload) {
- this.payload = payload;
- }
-
- /** Resets the term text, payload, flags, and positionIncrement,
- * startOffset, endOffset and token type to default.
- */
- @Override
- public void clear() {
- super.clear();
- payload = null;
- positionIncrement = 1;
- flags = 0;
- startOffset = endOffset = 0;
- type = DEFAULT_TYPE;
- }
-
- @Override
- public Object clone() {
- Token t = (Token)super.clone();
- // Do a deep clone
- if (payload != null) {
- t.payload = (Payload) payload.clone();
- }
- return t;
- }
-
- /** Makes a clone, but replaces the term buffer &
- * start/end offset in the process. This is more
- * efficient than doing a full clone (and then calling
- * {@link #copyBuffer}) because it saves a wasted copy of the old
- * termBuffer. */
- public Token clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
- final Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset);
- t.positionIncrement = positionIncrement;
- t.flags = flags;
- t.type = type;
- if (payload != null)
- t.payload = (Payload) payload.clone();
- return t;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj == this)
- return true;
-
- if (obj instanceof Token) {
- final Token other = (Token) obj;
- return (startOffset == other.startOffset &&
- endOffset == other.endOffset &&
- flags == other.flags &&
- positionIncrement == other.positionIncrement &&
- (type == null ? other.type == null : type.equals(other.type)) &&
- (payload == null ? other.payload == null : payload.equals(other.payload)) &&
- super.equals(obj)
- );
- } else
- return false;
- }
-
- @Override
- public int hashCode() {
- int code = super.hashCode();
- code = code * 31 + startOffset;
- code = code * 31 + endOffset;
- code = code * 31 + flags;
- code = code * 31 + positionIncrement;
- if (type != null)
- code = code * 31 + type.hashCode();
- if (payload != null)
- code = code * 31 + payload.hashCode();
- return code;
- }
-
- // like clear() but doesn't clear termBuffer/text
- private void clearNoTermBuffer() {
- payload = null;
- positionIncrement = 1;
- flags = 0;
- startOffset = endOffset = 0;
- type = DEFAULT_TYPE;
- }
-
- /** Shorthand for calling {@link #clear},
- * {@link #copyBuffer(char[], int, int)},
- * {@link #setStartOffset},
- * {@link #setEndOffset},
- * {@link #setType}
- * @return this Token instance */
- public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
- clearNoTermBuffer();
- copyBuffer(newTermBuffer, newTermOffset, newTermLength);
- payload = null;
- positionIncrement = 1;
- startOffset = newStartOffset;
- endOffset = newEndOffset;
- type = newType;
- return this;
- }
-
- /** Shorthand for calling {@link #clear},
- * {@link #copyBuffer(char[], int, int)},
- * {@link #setStartOffset},
- * {@link #setEndOffset}
- * {@link #setType} on Token.DEFAULT_TYPE
- * @return this Token instance */
- public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
- clearNoTermBuffer();
- copyBuffer(newTermBuffer, newTermOffset, newTermLength);
- startOffset = newStartOffset;
- endOffset = newEndOffset;
- type = DEFAULT_TYPE;
- return this;
- }
-
- /** Shorthand for calling {@link #clear},
- * {@link #append(CharSequence)},
- * {@link #setStartOffset},
- * {@link #setEndOffset}
- * {@link #setType}
- * @return this Token instance */
- public Token reinit(String newTerm, int newStartOffset, int newEndOffset, String newType) {
- clear();
- append(newTerm);
- startOffset = newStartOffset;
- endOffset = newEndOffset;
- type = newType;
- return this;
- }
-
- /** Shorthand for calling {@link #clear},
- * {@link #append(CharSequence, int, int)},
- * {@link #setStartOffset},
- * {@link #setEndOffset}
- * {@link #setType}
- * @return this Token instance */
- public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
- clear();
- append(newTerm, newTermOffset, newTermOffset + newTermLength);
- startOffset = newStartOffset;
- endOffset = newEndOffset;
- type = newType;
- return this;
- }
-
- /** Shorthand for calling {@link #clear},
- * {@link #append(CharSequence)},
- * {@link #setStartOffset},
- * {@link #setEndOffset}
- * {@link #setType} on Token.DEFAULT_TYPE
- * @return this Token instance */
- public Token reinit(String newTerm, int newStartOffset, int newEndOffset) {
- clear();
- append(newTerm);
- startOffset = newStartOffset;
- endOffset = newEndOffset;
- type = DEFAULT_TYPE;
- return this;
- }
-
- /** Shorthand for calling {@link #clear},
- * {@link #append(CharSequence, int, int)},
- * {@link #setStartOffset},
- * {@link #setEndOffset}
- * {@link #setType} on Token.DEFAULT_TYPE
- * @return this Token instance */
- public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
- clear();
- append(newTerm, newTermOffset, newTermOffset + newTermLength);
- startOffset = newStartOffset;
- endOffset = newEndOffset;
- type = DEFAULT_TYPE;
- return this;
- }
-
- /**
- * Copy the prototype token's fields into this one. Note: Payloads are shared.
- * @param prototype
- */
- public void reinit(Token prototype) {
- copyBuffer(prototype.buffer(), 0, prototype.length());
- positionIncrement = prototype.positionIncrement;
- flags = prototype.flags;
- startOffset = prototype.startOffset;
- endOffset = prototype.endOffset;
- type = prototype.type;
- payload = prototype.payload;
- }
-
- /**
- * Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
- * @param prototype
- * @param newTerm
- */
- public void reinit(Token prototype, String newTerm) {
- setEmpty().append(newTerm);
- positionIncrement = prototype.positionIncrement;
- flags = prototype.flags;
- startOffset = prototype.startOffset;
- endOffset = prototype.endOffset;
- type = prototype.type;
- payload = prototype.payload;
- }
-
- /**
- * Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
- * @param prototype
- * @param newTermBuffer
- * @param offset
- * @param length
- */
- public void reinit(Token prototype, char[] newTermBuffer, int offset, int length) {
- copyBuffer(newTermBuffer, offset, length);
- positionIncrement = prototype.positionIncrement;
- flags = prototype.flags;
- startOffset = prototype.startOffset;
- endOffset = prototype.endOffset;
- type = prototype.type;
- payload = prototype.payload;
- }
-
- @Override
- public void copyTo(AttributeImpl target) {
- if (target instanceof Token) {
- final Token to = (Token) target;
- to.reinit(this);
- // reinit shares the payload, so clone it:
- if (payload !=null) {
- to.payload = (Payload) payload.clone();
- }
- } else {
- super.copyTo(target);
- ((OffsetAttribute) target).setOffset(startOffset, endOffset);
- ((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
- ((PayloadAttribute) target).setPayload((payload == null) ? null : (Payload) payload.clone());
- ((FlagsAttribute) target).setFlags(flags);
- ((TypeAttribute) target).setType(type);
- }
- }
-
- @Override
- public void reflectWith(AttributeReflector reflector) {
- super.reflectWith(reflector);
- reflector.reflect(OffsetAttribute.class, "startOffset", startOffset);
- reflector.reflect(OffsetAttribute.class, "endOffset", endOffset);
- reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement);
- reflector.reflect(PayloadAttribute.class, "payload", payload);
- reflector.reflect(FlagsAttribute.class, "flags", flags);
- reflector.reflect(TypeAttribute.class, "type", type);
- }
-
- /** Convenience factory that returns <code>Token</code> as implementation for the basic
- * attributes and return the default impl (with "Impl" appended) for all other
- * attributes.
- * @since 3.0
- */
- public static final AttributeSource.AttributeFactory TOKEN_ATTRIBUTE_FACTORY =
- new TokenAttributeFactory(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
-
- /** <b>Expert:</b> Creates a TokenAttributeFactory returning {@link Token} as instance for the basic attributes
- * and for all other attributes calls the given delegate factory.
- * @since 3.0
- */
- public static final class TokenAttributeFactory extends AttributeSource.AttributeFactory {
-
- private final AttributeSource.AttributeFactory delegate;
-
- /** <b>Expert</b>: Creates an AttributeFactory returning {@link Token} as instance for the basic attributes
- * and for all other attributes calls the given delegate factory. */
- public TokenAttributeFactory(AttributeSource.AttributeFactory delegate) {
- this.delegate = delegate;
- }
-
- @Override
- public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
- return attClass.isAssignableFrom(Token.class)
- ? new Token() : delegate.createAttributeInstance(attClass);
- }
-
- @Override
- public boolean equals(Object other) {
- if (this == other) return true;
- if (other instanceof TokenAttributeFactory) {
- final TokenAttributeFactory af = (TokenAttributeFactory) other;
- return this.delegate.equals(af.delegate);
- }
- return false;
- }
-
- @Override
- public int hashCode() {
- return delegate.hashCode() ^ 0x0a45aa31;
- }
- }
-
-}