X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/analysis/Token.java?ds=sidebyside diff --git a/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/analysis/Token.java b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/analysis/Token.java new file mode 100644 index 0000000..152e02d --- /dev/null +++ b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/analysis/Token.java @@ -0,0 +1,648 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.tokenattributes.TermAttributeImpl; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.index.Payload; +import org.apache.lucene.index.TermPositions; // for javadoc +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.AttributeReflector; + +/** + A Token is an occurrence of a term from the text of a field. It consists of + a term's text, the start and end offset of the term in the text of the field, + and a type string. +

+ The start and end offsets permit applications to re-associate a token with + its source text, e.g., to display highlighted query terms in a document + browser, or to show matching text fragments in a KWIC + display, etc. +

+ The type is a string, assigned by a lexical analyzer + (a.k.a. tokenizer), naming the lexical or syntactic class that the token + belongs to. For example an end of sentence marker token might be implemented + with type "eos". The default token type is "word". +

+ A Token can optionally have metadata (a.k.a. Payload) in the form of a variable + length byte array. Use {@link TermPositions#getPayloadLength()} and + {@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index. + +

+ +

NOTE: As of 2.9, Token implements all {@link Attribute} interfaces + that are part of core Lucene and can be found in the {@code tokenattributes} subpackage. + Even though it is no longer necessary to use Token with the new TokenStream API, it can + be used as a convenience class that implements all {@link Attribute}s, which is especially useful + for easily switching from the old to the new TokenStream API. + +

+ +

Tokenizers and TokenFilters should try to re-use a Token + instance when possible for best performance, by + implementing the {@link TokenStream#incrementToken()} API. + Failing that, to create a new Token you should first use + one of the constructors that starts with null text. To load + the token from a char[] use {@link #copyBuffer(char[], int, int)}. + To load from a String use {@link #setEmpty} followed by {@link #append(CharSequence)} or {@link #append(CharSequence, int, int)}. + Alternatively you can get the Token's termBuffer by calling either {@link #buffer()}, + if you know that your text is shorter than the capacity of the termBuffer + or {@link #resizeBuffer(int)}, if there is any possibility + that you may need to grow the buffer. Fill in the characters of your term into this + buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string, + or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setLength(int)} to + set the length of the term text. See LUCENE-969 + for details.

+

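Typical Token reuse patterns: + <ul> + <li>Copying text from a String: {@code return reusableToken.reinit(string, startOffset, endOffset[, type]);}</li> + <li>Copying part of a String: {@code return reusableToken.reinit(string, offset, length, startOffset, endOffset[, type]);}</li> + <li>Copying text from a char[] buffer: {@code return reusableToken.reinit(buffer, offset, length, startOffset, endOffset[, type]);}</li> + <li>Copying everything from another Token (the payload is shared, not cloned): {@code reusableToken.reinit(source);}</li> + </ul> + In the first three patterns the type is reset to {@link #DEFAULT_TYPE} if it is not specified. +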

+ A few things to note: + +

+

+ Please note: With Lucene 3.1, the {@linkplain #toString toString()} method had to be changed to match the + {@link CharSequence} interface introduced by the interface {@link org.apache.lucene.analysis.tokenattributes.CharTermAttribute}. + This method now prints only the term text, without any additional information. +

+ @see org.apache.lucene.index.Payload +*/ +// TODO: change superclass to CharTermAttribute in 4.0! Maybe deprecate the whole class? +public class Token extends TermAttributeImpl + implements TypeAttribute, PositionIncrementAttribute, + FlagsAttribute, OffsetAttribute, PayloadAttribute { + + private int startOffset,endOffset; + private String type = DEFAULT_TYPE; + private int flags; + private Payload payload; + private int positionIncrement = 1; + + /** Constructs a Token will null text. */ + public Token() { + } + + /** Constructs a Token with null text and start & end + * offsets. + * @param start start offset in the source text + * @param end end offset in the source text */ + public Token(int start, int end) { + startOffset = start; + endOffset = end; + } + + /** Constructs a Token with null text and start & end + * offsets plus the Token type. + * @param start start offset in the source text + * @param end end offset in the source text + * @param typ the lexical type of this Token */ + public Token(int start, int end, String typ) { + startOffset = start; + endOffset = end; + type = typ; + } + + /** + * Constructs a Token with null text and start & end + * offsets plus flags. NOTE: flags is EXPERIMENTAL. + * @param start start offset in the source text + * @param end end offset in the source text + * @param flags The bits to set for this token + */ + public Token(int start, int end, int flags) { + startOffset = start; + endOffset = end; + this.flags = flags; + } + + /** Constructs a Token with the given term text, and start + * & end offsets. The type defaults to "word." + * NOTE: for better indexing speed you should + * instead use the char[] termBuffer methods to set the + * term text. 
+ * @param text term text + * @param start start offset + * @param end end offset + */ + public Token(String text, int start, int end) { + append(text); + startOffset = start; + endOffset = end; + } + + /** Constructs a Token with the given text, start and end + * offsets, & type. NOTE: for better indexing + * speed you should instead use the char[] termBuffer + * methods to set the term text. + * @param text term text + * @param start start offset + * @param end end offset + * @param typ token type + */ + public Token(String text, int start, int end, String typ) { + append(text); + startOffset = start; + endOffset = end; + type = typ; + } + + /** + * Constructs a Token with the given text, start and end + * offsets, & type. NOTE: for better indexing + * speed you should instead use the char[] termBuffer + * methods to set the term text. + * @param text + * @param start + * @param end + * @param flags token type bits + */ + public Token(String text, int start, int end, int flags) { + append(text); + startOffset = start; + endOffset = end; + this.flags = flags; + } + + /** + * Constructs a Token with the given term buffer (offset + * & length), start and end + * offsets + * @param startTermBuffer + * @param termBufferOffset + * @param termBufferLength + * @param start + * @param end + */ + public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) { + copyBuffer(startTermBuffer, termBufferOffset, termBufferLength); + startOffset = start; + endOffset = end; + } + + /** Set the position increment. This determines the position of this token + * relative to the previous Token in a {@link TokenStream}, used in phrase + * searching. + * + *

The default value is one. + * + *

Some common uses for this are:

+ * @param positionIncrement the distance from the prior term + * @see org.apache.lucene.index.TermPositions + */ + public void setPositionIncrement(int positionIncrement) { + if (positionIncrement < 0) + throw new IllegalArgumentException + ("Increment must be zero or greater: " + positionIncrement); + this.positionIncrement = positionIncrement; + } + + /** Returns the position increment of this Token. + * @see #setPositionIncrement + */ + public int getPositionIncrement() { + return positionIncrement; + } + + /** Returns this Token's starting offset, the position of the first character + corresponding to this token in the source text. + + Note that the difference between endOffset() and startOffset() may not be + equal to {@link #length}, as the term text may have been altered by a + stemmer or some other filter. */ + public final int startOffset() { + return startOffset; + } + + /** Set the starting offset. + @see #startOffset() */ + public void setStartOffset(int offset) { + this.startOffset = offset; + } + + /** Returns this Token's ending offset, one greater than the position of the + last character corresponding to this token in the source text. The length + of the token in the source text is (endOffset - startOffset). */ + public final int endOffset() { + return endOffset; + } + + /** Set the ending offset. + @see #endOffset() */ + public void setEndOffset(int offset) { + this.endOffset = offset; + } + + /** Set the starting and ending offset. + @see #startOffset() and #endOffset()*/ + public void setOffset(int startOffset, int endOffset) { + this.startOffset = startOffset; + this.endOffset = endOffset; + } + + /** Returns this Token's lexical type. Defaults to "word". */ + public final String type() { + return type; + } + + /** Set the lexical type. + @see #type() */ + public final void setType(String type) { + this.type = type; + } + + /** + *

+ * + * Get the bitset for any bits that have been set. This is completely distinct from {@link #type()}, although they do share similar purposes. + * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s. + * + * + * @return The bits + * @lucene.experimental While we think this is here to stay, we may want to change it to be a long. + */ + public int getFlags() { + return flags; + } + + /** + * @see #getFlags() + */ + public void setFlags(int flags) { + this.flags = flags; + } + + /** + * Returns this Token's payload. + */ + public Payload getPayload() { + return this.payload; + } + + /** + * Sets this Token's payload. + */ + public void setPayload(Payload payload) { + this.payload = payload; + } + + /** Resets the term text, payload, flags, and positionIncrement, + * startOffset, endOffset and token type to default. + */ + @Override + public void clear() { + super.clear(); + payload = null; + positionIncrement = 1; + flags = 0; + startOffset = endOffset = 0; + type = DEFAULT_TYPE; + } + + @Override + public Object clone() { + Token t = (Token)super.clone(); + // Do a deep clone + if (payload != null) { + t.payload = (Payload) payload.clone(); + } + return t; + } + + /** Makes a clone, but replaces the term buffer & + * start/end offset in the process. This is more + * efficient than doing a full clone (and then calling + * {@link #copyBuffer}) because it saves a wasted copy of the old + * termBuffer. 
*/ + public Token clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { + final Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset); + t.positionIncrement = positionIncrement; + t.flags = flags; + t.type = type; + if (payload != null) + t.payload = (Payload) payload.clone(); + return t; + } + + @Override + public boolean equals(Object obj) { + if (obj == this) + return true; + + if (obj instanceof Token) { + final Token other = (Token) obj; + return (startOffset == other.startOffset && + endOffset == other.endOffset && + flags == other.flags && + positionIncrement == other.positionIncrement && + (type == null ? other.type == null : type.equals(other.type)) && + (payload == null ? other.payload == null : payload.equals(other.payload)) && + super.equals(obj) + ); + } else + return false; + } + + @Override + public int hashCode() { + int code = super.hashCode(); + code = code * 31 + startOffset; + code = code * 31 + endOffset; + code = code * 31 + flags; + code = code * 31 + positionIncrement; + if (type != null) + code = code * 31 + type.hashCode(); + if (payload != null) + code = code * 31 + payload.hashCode(); + return code; + } + + // like clear() but doesn't clear termBuffer/text + private void clearNoTermBuffer() { + payload = null; + positionIncrement = 1; + flags = 0; + startOffset = endOffset = 0; + type = DEFAULT_TYPE; + } + + /** Shorthand for calling {@link #clear}, + * {@link #copyBuffer(char[], int, int)}, + * {@link #setStartOffset}, + * {@link #setEndOffset}, + * {@link #setType} + * @return this Token instance */ + public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) { + clearNoTermBuffer(); + copyBuffer(newTermBuffer, newTermOffset, newTermLength); + payload = null; + positionIncrement = 1; + startOffset = newStartOffset; + endOffset = newEndOffset; + type = newType; + return 
this; + } + + /** Shorthand for calling {@link #clear}, + * {@link #copyBuffer(char[], int, int)}, + * {@link #setStartOffset}, + * {@link #setEndOffset} + * {@link #setType} on Token.DEFAULT_TYPE + * @return this Token instance */ + public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { + clearNoTermBuffer(); + copyBuffer(newTermBuffer, newTermOffset, newTermLength); + startOffset = newStartOffset; + endOffset = newEndOffset; + type = DEFAULT_TYPE; + return this; + } + + /** Shorthand for calling {@link #clear}, + * {@link #append(CharSequence)}, + * {@link #setStartOffset}, + * {@link #setEndOffset} + * {@link #setType} + * @return this Token instance */ + public Token reinit(String newTerm, int newStartOffset, int newEndOffset, String newType) { + clear(); + append(newTerm); + startOffset = newStartOffset; + endOffset = newEndOffset; + type = newType; + return this; + } + + /** Shorthand for calling {@link #clear}, + * {@link #append(CharSequence, int, int)}, + * {@link #setStartOffset}, + * {@link #setEndOffset} + * {@link #setType} + * @return this Token instance */ + public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) { + clear(); + append(newTerm, newTermOffset, newTermOffset + newTermLength); + startOffset = newStartOffset; + endOffset = newEndOffset; + type = newType; + return this; + } + + /** Shorthand for calling {@link #clear}, + * {@link #append(CharSequence)}, + * {@link #setStartOffset}, + * {@link #setEndOffset} + * {@link #setType} on Token.DEFAULT_TYPE + * @return this Token instance */ + public Token reinit(String newTerm, int newStartOffset, int newEndOffset) { + clear(); + append(newTerm); + startOffset = newStartOffset; + endOffset = newEndOffset; + type = DEFAULT_TYPE; + return this; + } + + /** Shorthand for calling {@link #clear}, + * {@link #append(CharSequence, int, int)}, + * {@link 
#setStartOffset}, + * {@link #setEndOffset} + * {@link #setType} on Token.DEFAULT_TYPE + * @return this Token instance */ + public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { + clear(); + append(newTerm, newTermOffset, newTermOffset + newTermLength); + startOffset = newStartOffset; + endOffset = newEndOffset; + type = DEFAULT_TYPE; + return this; + } + + /** + * Copy the prototype token's fields into this one. Note: Payloads are shared. + * @param prototype + */ + public void reinit(Token prototype) { + copyBuffer(prototype.buffer(), 0, prototype.length()); + positionIncrement = prototype.positionIncrement; + flags = prototype.flags; + startOffset = prototype.startOffset; + endOffset = prototype.endOffset; + type = prototype.type; + payload = prototype.payload; + } + + /** + * Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared. + * @param prototype + * @param newTerm + */ + public void reinit(Token prototype, String newTerm) { + setEmpty().append(newTerm); + positionIncrement = prototype.positionIncrement; + flags = prototype.flags; + startOffset = prototype.startOffset; + endOffset = prototype.endOffset; + type = prototype.type; + payload = prototype.payload; + } + + /** + * Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared. 
+ * @param prototype + * @param newTermBuffer + * @param offset + * @param length + */ + public void reinit(Token prototype, char[] newTermBuffer, int offset, int length) { + copyBuffer(newTermBuffer, offset, length); + positionIncrement = prototype.positionIncrement; + flags = prototype.flags; + startOffset = prototype.startOffset; + endOffset = prototype.endOffset; + type = prototype.type; + payload = prototype.payload; + } + + @Override + public void copyTo(AttributeImpl target) { + if (target instanceof Token) { + final Token to = (Token) target; + to.reinit(this); + // reinit shares the payload, so clone it: + if (payload !=null) { + to.payload = (Payload) payload.clone(); + } + } else { + super.copyTo(target); + ((OffsetAttribute) target).setOffset(startOffset, endOffset); + ((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement); + ((PayloadAttribute) target).setPayload((payload == null) ? null : (Payload) payload.clone()); + ((FlagsAttribute) target).setFlags(flags); + ((TypeAttribute) target).setType(type); + } + } + + @Override + public void reflectWith(AttributeReflector reflector) { + super.reflectWith(reflector); + reflector.reflect(OffsetAttribute.class, "startOffset", startOffset); + reflector.reflect(OffsetAttribute.class, "endOffset", endOffset); + reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement); + reflector.reflect(PayloadAttribute.class, "payload", payload); + reflector.reflect(FlagsAttribute.class, "flags", flags); + reflector.reflect(TypeAttribute.class, "type", type); + } + + /** Convenience factory that returns Token as implementation for the basic + * attributes and return the default impl (with "Impl" appended) for all other + * attributes. 
+ * @since 3.0 + */ + public static final AttributeSource.AttributeFactory TOKEN_ATTRIBUTE_FACTORY = + new TokenAttributeFactory(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); + + /** Expert: Creates a TokenAttributeFactory returning {@link Token} as instance for the basic attributes + * and for all other attributes calls the given delegate factory. + * @since 3.0 + */ + public static final class TokenAttributeFactory extends AttributeSource.AttributeFactory { + + private final AttributeSource.AttributeFactory delegate; + + /** Expert: Creates an AttributeFactory returning {@link Token} as instance for the basic attributes + * and for all other attributes calls the given delegate factory. */ + public TokenAttributeFactory(AttributeSource.AttributeFactory delegate) { + this.delegate = delegate; + } + + @Override + public AttributeImpl createAttributeInstance(Class attClass) { + return attClass.isAssignableFrom(Token.class) + ? new Token() : delegate.createAttributeInstance(attClass); + } + + @Override + public boolean equals(Object other) { + if (this == other) return true; + if (other instanceof TokenAttributeFactory) { + final TokenAttributeFactory af = (TokenAttributeFactory) other; + return this.delegate.equals(af.delegate); + } + return false; + } + + @Override + public int hashCode() { + return delegate.hashCode() ^ 0x0a45aa31; + } + } + +}