X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/Token.java diff --git a/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/Token.java b/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/Token.java deleted file mode 100644 index 152e02d..0000000 --- a/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/Token.java +++ /dev/null @@ -1,648 +0,0 @@ -package org.apache.lucene.analysis; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.tokenattributes.TermAttributeImpl; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -import org.apache.lucene.index.Payload; -import org.apache.lucene.index.TermPositions; // for javadoc -import org.apache.lucene.util.Attribute; -import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.AttributeImpl; -import org.apache.lucene.util.AttributeReflector; - -/** - A Token is an occurrence of a term from the text of a field. It consists of - a term's text, the start and end offset of the term in the text of the field, - and a type string. -

- The start and end offsets permit applications to re-associate a token with - its source text, e.g., to display highlighted query terms in a document - browser, or to show matching text fragments in a KWIC - display, etc. -

- The type is a string, assigned by a lexical analyzer - (a.k.a. tokenizer), naming the lexical or syntactic class that the token - belongs to. For example an end of sentence marker token might be implemented - with type "eos". The default token type is "word". -

- A Token can optionally have metadata (a.k.a. Payload) in the form of a variable - length byte array. Use {@link TermPositions#getPayloadLength()} and - {@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index. - -

- -

NOTE: As of 2.9, Token implements all {@link Attribute} interfaces - that are part of core Lucene and can be found in the {@code tokenattributes} subpackage. - Even though it is not necessary to use Token anymore, with the new TokenStream API it can - be used as convenience class that implements all {@link Attribute}s, which is especially useful - to easily switch from the old to the new TokenStream API. - -

- -

Tokenizers and TokenFilters should try to re-use a Token - instance when possible for best performance, by - implementing the {@link TokenStream#incrementToken()} API. - Failing that, to create a new Token you should first use - one of the constructors that starts with null text. To load - the token from a char[] use {@link #copyBuffer(char[], int, int)}. - To load from a String use {@link #setEmpty} followed by {@link #append(CharSequence)} or {@link #append(CharSequence, int, int)}. - Alternatively you can get the Token's termBuffer by calling either {@link #buffer()}, - if you know that your text is shorter than the capacity of the termBuffer - or {@link #resizeBuffer(int)}, if there is any possibility - that you may need to grow the buffer. Fill in the characters of your term into this - buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string, - or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setLength(int)} to - set the length of the term text. See LUCENE-969 - for details.

-

Typical Token reuse patterns: -

- A few things to note: - -

-

- Please note: With Lucene 3.1, the {@linkplain #toString toString()} method had to be changed to match the - {@link CharSequence} interface introduced by the interface {@link org.apache.lucene.analysis.tokenattributes.CharTermAttribute}. - This method now only prints the term text, no additional information anymore. -

- @see org.apache.lucene.index.Payload -*/ -// TODO: change superclass to CharTermAttribute in 4.0! Maybe deprecate the whole class? -public class Token extends TermAttributeImpl - implements TypeAttribute, PositionIncrementAttribute, - FlagsAttribute, OffsetAttribute, PayloadAttribute { - - private int startOffset,endOffset; - private String type = DEFAULT_TYPE; - private int flags; - private Payload payload; - private int positionIncrement = 1; - - /** Constructs a Token will null text. */ - public Token() { - } - - /** Constructs a Token with null text and start & end - * offsets. - * @param start start offset in the source text - * @param end end offset in the source text */ - public Token(int start, int end) { - startOffset = start; - endOffset = end; - } - - /** Constructs a Token with null text and start & end - * offsets plus the Token type. - * @param start start offset in the source text - * @param end end offset in the source text - * @param typ the lexical type of this Token */ - public Token(int start, int end, String typ) { - startOffset = start; - endOffset = end; - type = typ; - } - - /** - * Constructs a Token with null text and start & end - * offsets plus flags. NOTE: flags is EXPERIMENTAL. - * @param start start offset in the source text - * @param end end offset in the source text - * @param flags The bits to set for this token - */ - public Token(int start, int end, int flags) { - startOffset = start; - endOffset = end; - this.flags = flags; - } - - /** Constructs a Token with the given term text, and start - * & end offsets. The type defaults to "word." - * NOTE: for better indexing speed you should - * instead use the char[] termBuffer methods to set the - * term text. - * @param text term text - * @param start start offset - * @param end end offset - */ - public Token(String text, int start, int end) { - append(text); - startOffset = start; - endOffset = end; - } - - /** Constructs a Token with the given text, start and end - * offsets, & type. NOTE: for better indexing - * speed you should instead use the char[] termBuffer - * methods to set the term text. - * @param text term text - * @param start start offset - * @param end end offset - * @param typ token type - */ - public Token(String text, int start, int end, String typ) { - append(text); - startOffset = start; - endOffset = end; - type = typ; - } - - /** - * Constructs a Token with the given text, start and end - * offsets, & type. NOTE: for better indexing - * speed you should instead use the char[] termBuffer - * methods to set the term text. - * @param text - * @param start - * @param end - * @param flags token type bits - */ - public Token(String text, int start, int end, int flags) { - append(text); - startOffset = start; - endOffset = end; - this.flags = flags; - } - - /** - * Constructs a Token with the given term buffer (offset - * & length), start and end - * offsets - * @param startTermBuffer - * @param termBufferOffset - * @param termBufferLength - * @param start - * @param end - */ - public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) { - copyBuffer(startTermBuffer, termBufferOffset, termBufferLength); - startOffset = start; - endOffset = end; - } - - /** Set the position increment. This determines the position of this token - * relative to the previous Token in a {@link TokenStream}, used in phrase - * searching. - * - *

The default value is one. - * - *

Some common uses for this are:

- * @param positionIncrement the distance from the prior term - * @see org.apache.lucene.index.TermPositions - */ - public void setPositionIncrement(int positionIncrement) { - if (positionIncrement < 0) - throw new IllegalArgumentException - ("Increment must be zero or greater: " + positionIncrement); - this.positionIncrement = positionIncrement; - } - - /** Returns the position increment of this Token. - * @see #setPositionIncrement - */ - public int getPositionIncrement() { - return positionIncrement; - } - - /** Returns this Token's starting offset, the position of the first character - corresponding to this token in the source text. - - Note that the difference between endOffset() and startOffset() may not be - equal to {@link #length}, as the term text may have been altered by a - stemmer or some other filter. */ - public final int startOffset() { - return startOffset; - } - - /** Set the starting offset. - @see #startOffset() */ - public void setStartOffset(int offset) { - this.startOffset = offset; - } - - /** Returns this Token's ending offset, one greater than the position of the - last character corresponding to this token in the source text. The length - of the token in the source text is (endOffset - startOffset). */ - public final int endOffset() { - return endOffset; - } - - /** Set the ending offset. - @see #endOffset() */ - public void setEndOffset(int offset) { - this.endOffset = offset; - } - - /** Set the starting and ending offset. - @see #startOffset() and #endOffset()*/ - public void setOffset(int startOffset, int endOffset) { - this.startOffset = startOffset; - this.endOffset = endOffset; - } - - /** Returns this Token's lexical type. Defaults to "word". */ - public final String type() { - return type; - } - - /** Set the lexical type. - @see #type() */ - public final void setType(String type) { - this.type = type; - } - - /** - *

- * - * Get the bitset for any bits that have been set. This is completely distinct from {@link #type()}, although they do share similar purposes. - * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s. - * - * - * @return The bits - * @lucene.experimental While we think this is here to stay, we may want to change it to be a long. - */ - public int getFlags() { - return flags; - } - - /** - * @see #getFlags() - */ - public void setFlags(int flags) { - this.flags = flags; - } - - /** - * Returns this Token's payload. - */ - public Payload getPayload() { - return this.payload; - } - - /** - * Sets this Token's payload. - */ - public void setPayload(Payload payload) { - this.payload = payload; - } - - /** Resets the term text, payload, flags, and positionIncrement, - * startOffset, endOffset and token type to default. - */ - @Override - public void clear() { - super.clear(); - payload = null; - positionIncrement = 1; - flags = 0; - startOffset = endOffset = 0; - type = DEFAULT_TYPE; - } - - @Override - public Object clone() { - Token t = (Token)super.clone(); - // Do a deep clone - if (payload != null) { - t.payload = (Payload) payload.clone(); - } - return t; - } - - /** Makes a clone, but replaces the term buffer & - * start/end offset in the process. This is more - * efficient than doing a full clone (and then calling - * {@link #copyBuffer}) because it saves a wasted copy of the old - * termBuffer. */ - public Token clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { - final Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset); - t.positionIncrement = positionIncrement; - t.flags = flags; - t.type = type; - if (payload != null) - t.payload = (Payload) payload.clone(); - return t; - } - - @Override - public boolean equals(Object obj) { - if (obj == this) - return true; - - if (obj instanceof Token) { - final Token other = (Token) obj; - return (startOffset == other.startOffset && - endOffset == other.endOffset && - flags == other.flags && - positionIncrement == other.positionIncrement && - (type == null ? other.type == null : type.equals(other.type)) && - (payload == null ? other.payload == null : payload.equals(other.payload)) && - super.equals(obj) - ); - } else - return false; - } - - @Override - public int hashCode() { - int code = super.hashCode(); - code = code * 31 + startOffset; - code = code * 31 + endOffset; - code = code * 31 + flags; - code = code * 31 + positionIncrement; - if (type != null) - code = code * 31 + type.hashCode(); - if (payload != null) - code = code * 31 + payload.hashCode(); - return code; - } - - // like clear() but doesn't clear termBuffer/text - private void clearNoTermBuffer() { - payload = null; - positionIncrement = 1; - flags = 0; - startOffset = endOffset = 0; - type = DEFAULT_TYPE; - } - - /** Shorthand for calling {@link #clear}, - * {@link #copyBuffer(char[], int, int)}, - * {@link #setStartOffset}, - * {@link #setEndOffset}, - * {@link #setType} - * @return this Token instance */ - public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) { - clearNoTermBuffer(); - copyBuffer(newTermBuffer, newTermOffset, newTermLength); - payload = null; - positionIncrement = 1; - startOffset = newStartOffset; - endOffset = newEndOffset; - type = newType; - return this; - } - - /** Shorthand for calling {@link #clear}, - * {@link #copyBuffer(char[], int, int)}, - * {@link #setStartOffset}, - * {@link #setEndOffset} - * {@link #setType} on Token.DEFAULT_TYPE - * @return this Token instance */ - public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { - clearNoTermBuffer(); - copyBuffer(newTermBuffer, newTermOffset, newTermLength); - startOffset = newStartOffset; - endOffset = newEndOffset; - type = DEFAULT_TYPE; - return this; - } - - /** Shorthand for calling {@link #clear}, - * {@link #append(CharSequence)}, - * {@link #setStartOffset}, - * {@link #setEndOffset} - * {@link #setType} - * @return this Token instance */ - public Token reinit(String newTerm, int newStartOffset, int newEndOffset, String newType) { - clear(); - append(newTerm); - startOffset = newStartOffset; - endOffset = newEndOffset; - type = newType; - return this; - } - - /** Shorthand for calling {@link #clear}, - * {@link #append(CharSequence, int, int)}, - * {@link #setStartOffset}, - * {@link #setEndOffset} - * {@link #setType} - * @return this Token instance */ - public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) { - clear(); - append(newTerm, newTermOffset, newTermOffset + newTermLength); - startOffset = newStartOffset; - endOffset = newEndOffset; - type = newType; - return this; - } - - /** Shorthand for calling {@link #clear}, - * {@link #append(CharSequence)}, - * {@link #setStartOffset}, - * {@link #setEndOffset} - * {@link #setType} on Token.DEFAULT_TYPE - * @return this Token instance */ - public Token reinit(String newTerm, int newStartOffset, int newEndOffset) { - clear(); - append(newTerm); - startOffset = newStartOffset; - endOffset = newEndOffset; - type = DEFAULT_TYPE; - return this; - } - - /** Shorthand for calling {@link #clear}, - * {@link #append(CharSequence, int, int)}, - * {@link #setStartOffset}, - * {@link #setEndOffset} - * {@link #setType} on Token.DEFAULT_TYPE - * @return this Token instance */ - public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { - clear(); - append(newTerm, newTermOffset, newTermOffset + newTermLength); - startOffset = newStartOffset; - endOffset = newEndOffset; - type = DEFAULT_TYPE; - return this; - } - - /** - * Copy the prototype token's fields into this one. Note: Payloads are shared. - * @param prototype - */ - public void reinit(Token prototype) { - copyBuffer(prototype.buffer(), 0, prototype.length()); - positionIncrement = prototype.positionIncrement; - flags = prototype.flags; - startOffset = prototype.startOffset; - endOffset = prototype.endOffset; - type = prototype.type; - payload = prototype.payload; - } - - /** - * Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared. - * @param prototype - * @param newTerm - */ - public void reinit(Token prototype, String newTerm) { - setEmpty().append(newTerm); - positionIncrement = prototype.positionIncrement; - flags = prototype.flags; - startOffset = prototype.startOffset; - endOffset = prototype.endOffset; - type = prototype.type; - payload = prototype.payload; - } - - /** - * Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared. - * @param prototype - * @param newTermBuffer - * @param offset - * @param length - */ - public void reinit(Token prototype, char[] newTermBuffer, int offset, int length) { - copyBuffer(newTermBuffer, offset, length); - positionIncrement = prototype.positionIncrement; - flags = prototype.flags; - startOffset = prototype.startOffset; - endOffset = prototype.endOffset; - type = prototype.type; - payload = prototype.payload; - } - - @Override - public void copyTo(AttributeImpl target) { - if (target instanceof Token) { - final Token to = (Token) target; - to.reinit(this); - // reinit shares the payload, so clone it: - if (payload !=null) { - to.payload = (Payload) payload.clone(); - } - } else { - super.copyTo(target); - ((OffsetAttribute) target).setOffset(startOffset, endOffset); - ((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement); - ((PayloadAttribute) target).setPayload((payload == null) ? null : (Payload) payload.clone()); - ((FlagsAttribute) target).setFlags(flags); - ((TypeAttribute) target).setType(type); - } - } - - @Override - public void reflectWith(AttributeReflector reflector) { - super.reflectWith(reflector); - reflector.reflect(OffsetAttribute.class, "startOffset", startOffset); - reflector.reflect(OffsetAttribute.class, "endOffset", endOffset); - reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement); - reflector.reflect(PayloadAttribute.class, "payload", payload); - reflector.reflect(FlagsAttribute.class, "flags", flags); - reflector.reflect(TypeAttribute.class, "type", type); - } - - /** Convenience factory that returns Token as implementation for the basic - * attributes and return the default impl (with "Impl" appended) for all other - * attributes. - * @since 3.0 - */ - public static final AttributeSource.AttributeFactory TOKEN_ATTRIBUTE_FACTORY = - new TokenAttributeFactory(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); - - /** Expert: Creates a TokenAttributeFactory returning {@link Token} as instance for the basic attributes - * and for all other attributes calls the given delegate factory. - * @since 3.0 - */ - public static final class TokenAttributeFactory extends AttributeSource.AttributeFactory { - - private final AttributeSource.AttributeFactory delegate; - - /** Expert: Creates an AttributeFactory returning {@link Token} as instance for the basic attributes - * and for all other attributes calls the given delegate factory. */ - public TokenAttributeFactory(AttributeSource.AttributeFactory delegate) { - this.delegate = delegate; - } - - @Override - public AttributeImpl createAttributeInstance(Class attClass) { - return attClass.isAssignableFrom(Token.class) - ? new Token() : delegate.createAttributeInstance(attClass); - } - - @Override - public boolean equals(Object other) { - if (this == other) return true; - if (other instanceof TokenAttributeFactory) { - final TokenAttributeFactory af = (TokenAttributeFactory) other; - return this.delegate.equals(af.delegate); - } - return false; - } - - @Override - public int hashCode() { - return delegate.hashCode() ^ 0x0a45aa31; - } - } - -}