1 package org.apache.lucene.analysis;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import org.apache.lucene.analysis.tokenattributes.TermAttributeImpl;
21 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
22 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
23 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
24 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
25 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
26 import org.apache.lucene.index.Payload;
27 import org.apache.lucene.index.TermPositions; // for javadoc
28 import org.apache.lucene.util.Attribute;
29 import org.apache.lucene.util.AttributeSource;
30 import org.apache.lucene.util.AttributeImpl;
31 import org.apache.lucene.util.AttributeReflector;
34 A Token is an occurrence of a term from the text of a field. It consists of
35 a term's text, the start and end offset of the term in the text of the field,
38 The start and end offsets permit applications to re-associate a token with
39 its source text, e.g., to display highlighted query terms in a document
40 browser, or to show matching text fragments in a <abbr title="KeyWord In Context">KWIC</abbr>
43 The type is a string, assigned by a lexical analyzer
44 (a.k.a. tokenizer), naming the lexical or syntactic class that the token
45 belongs to. For example an end of sentence marker token might be implemented
46 with type "eos". The default token type is "word".
48 A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
49 length byte array. Use {@link TermPositions#getPayloadLength()} and
50 {@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.
54 <p><b>NOTE:</b> As of 2.9, Token implements all {@link Attribute} interfaces
55 that are part of core Lucene and can be found in the {@code tokenattributes} subpackage.
56 Even though it is not necessary to use Token anymore, with the new TokenStream API it can
57 be used as convenience class that implements all {@link Attribute}s, which is especially useful
58 to easily switch from the old to the new TokenStream API.
62 <p>Tokenizers and TokenFilters should try to re-use a Token
63 instance when possible for best performance, by
64 implementing the {@link TokenStream#incrementToken()} API.
65 Failing that, to create a new Token you should first use
66 one of the constructors that starts with null text. To load
67 the token from a char[] use {@link #copyBuffer(char[], int, int)}.
68 To load from a String use {@link #setEmpty} followed by {@link #append(CharSequence)} or {@link #append(CharSequence, int, int)}.
69 Alternatively you can get the Token's termBuffer by calling either {@link #buffer()},
70 if you know that your text is shorter than the capacity of the termBuffer
71 or {@link #resizeBuffer(int)}, if there is any possibility
72 that you may need to grow the buffer. Fill in the characters of your term into this
73 buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string,
74 or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setLength(int)} to
75 set the length of the term text. See <a target="_top"
76 href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
78 <p>Typical Token reuse patterns:
80 <li> Copying text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
82 return reusableToken.reinit(string, startOffset, endOffset[, type]);
85 <li> Copying some text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
87 return reusableToken.reinit(string, 0, string.length(), startOffset, endOffset[, type]);
91 <li> Copying text from char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
93 return reusableToken.reinit(buffer, 0, buffer.length, startOffset, endOffset[, type]);
96 <li> Copying some text from a char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
98 return reusableToken.reinit(buffer, start, end - start, startOffset, endOffset[, type]);
<li> Copying from one Token to another (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
103 return reusableToken.reinit(source.buffer(), 0, source.length(), source.startOffset(), source.endOffset()[, source.type()]);
107 A few things to note:
109 <li>clear() initializes all of the fields to default values. This was changed in contrast to Lucene 2.4, but should affect no one.</li>
110 <li>Because <code>TokenStreams</code> can be chained, one cannot assume that the <code>Token's</code> current type is correct.</li>
<li>The startOffset and endOffset represent the start and end offset in the source text, so be careful in adjusting them.</li>
112 <li>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.</li>
116 <b>Please note:</b> With Lucene 3.1, the <code>{@linkplain #toString toString()}</code> method had to be changed to match the
117 {@link CharSequence} interface introduced by the interface {@link org.apache.lucene.analysis.tokenattributes.CharTermAttribute}.
118 This method now only prints the term text, no additional information anymore.
120 @see org.apache.lucene.index.Payload
122 // TODO: change superclass to CharTermAttribute in 4.0! Maybe deprecate the whole class?
public class Token extends TermAttributeImpl
implements TypeAttribute, PositionIncrementAttribute,
FlagsAttribute, OffsetAttribute, PayloadAttribute {

// Character offsets of this token in the original source text.
private int startOffset,endOffset;
// Lexical type, e.g. "word"; see TypeAttribute.DEFAULT_TYPE.
private String type = DEFAULT_TYPE;
// NOTE(review): a `private int flags;` field is referenced by methods below
// (getFlags/setFlags, equals, hashCode) but is not visible in this chunk —
// presumably declared here in the full file; confirm against the original.
// Optional per-token metadata stored in the index; may be null.
private Payload payload;
// Position distance from the previous token; defaults to 1.
private int positionIncrement = 1;
/** Constructs a Token with null text. */
/** Constructs a Token with null text and start &amp; end
 * offsets.
 * @param start start offset in the source text
 * @param end end offset in the source text */
public Token(int start, int end) {
  startOffset = start;
  endOffset = end;
}
/** Constructs a Token with null text and start &amp; end
 * offsets plus the Token type.
 * @param start start offset in the source text
 * @param end end offset in the source text
 * @param typ the lexical type of this Token */
public Token(int start, int end, String typ) {
  startOffset = start;
  endOffset = end;
  type = typ;
}
/**
 * Constructs a Token with null text and start &amp; end
 * offsets plus flags. NOTE: flags is EXPERIMENTAL.
 * @param start start offset in the source text
 * @param end end offset in the source text
 * @param flags The bits to set for this token
 */
public Token(int start, int end, int flags) {
  startOffset = start;
  endOffset = end;
  this.flags = flags;
}
/** Constructs a Token with the given term text, and start
 * &amp; end offsets. The type defaults to "word."
 * <b>NOTE:</b> for better indexing speed you should
 * instead use the char[] termBuffer methods to set the
 * term text.
 * @param text term text
 * @param start start offset
 * @param end end offset
 */
public Token(String text, int start, int end) {
  append(text);
  startOffset = start;
  endOffset = end;
}
/** Constructs a Token with the given text, start and end
 * offsets, &amp; type. <b>NOTE:</b> for better indexing
 * speed you should instead use the char[] termBuffer
 * methods to set the term text.
 * @param text term text
 * @param start start offset
 * @param end end offset
 * @param typ token type
 */
public Token(String text, int start, int end, String typ) {
  append(text);
  startOffset = start;
  endOffset = end;
  type = typ;
}
/**
 * Constructs a Token with the given text, start and end
 * offsets, &amp; flags. <b>NOTE:</b> for better indexing
 * speed you should instead use the char[] termBuffer
 * methods to set the term text.
 * @param text term text
 * @param start start offset
 * @param end end offset
 * @param flags token type bits
 */
public Token(String text, int start, int end, int flags) {
  append(text);
  startOffset = start;
  endOffset = end;
  this.flags = flags;
}
/**
 * Constructs a Token with the given term buffer (offset
 * &amp; length), start and end offsets.
 * @param startTermBuffer buffer containing term text
 * @param termBufferOffset the index in the buffer of the first character
 * @param termBufferLength number of valid characters in the buffer
 * @param start start offset in the source text
 * @param end end offset in the source text
 */
public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) {
  copyBuffer(startTermBuffer, termBufferOffset, termBufferLength);
  startOffset = start;
  endOffset = end;
}
234 /** Set the position increment. This determines the position of this token
235 * relative to the previous Token in a {@link TokenStream}, used in phrase
238 * <p>The default value is one.
240 * <p>Some common uses for this are:<ul>
242 * <li>Set it to zero to put multiple terms in the same position. This is
243 * useful if, e.g., a word has multiple stems. Searches for phrases
244 * including either stem will match. In this case, all but the first stem's
245 * increment should be set to zero: the increment of the first instance
246 * should be one. Repeating a token with an increment of zero can also be
247 * used to boost the scores of matches on that token.
249 * <li>Set it to values greater than one to inhibit exact phrase matches.
250 * If, for example, one does not want phrases to match across removed stop
251 * words, then one could build a stop word filter that removes stop words and
252 * also sets the increment to the number of stop words removed before each
253 * non-stop word. Then exact phrase queries will only match when the terms
254 * occur with no intervening stop words.
257 * @param positionIncrement the distance from the prior term
258 * @see org.apache.lucene.index.TermPositions
260 public void setPositionIncrement(int positionIncrement) {
261 if (positionIncrement < 0)
262 throw new IllegalArgumentException
263 ("Increment must be zero or greater: " + positionIncrement);
264 this.positionIncrement = positionIncrement;
267 /** Returns the position increment of this Token.
268 * @see #setPositionIncrement
270 public int getPositionIncrement() {
271 return positionIncrement;
274 /** Returns this Token's starting offset, the position of the first character
275 corresponding to this token in the source text.
277 Note that the difference between endOffset() and startOffset() may not be
278 equal to {@link #length}, as the term text may have been altered by a
279 stemmer or some other filter. */
280 public final int startOffset() {
284 /** Set the starting offset.
285 @see #startOffset() */
286 public void setStartOffset(int offset) {
287 this.startOffset = offset;
290 /** Returns this Token's ending offset, one greater than the position of the
291 last character corresponding to this token in the source text. The length
292 of the token in the source text is (endOffset - startOffset). */
293 public final int endOffset() {
297 /** Set the ending offset.
299 public void setEndOffset(int offset) {
300 this.endOffset = offset;
303 /** Set the starting and ending offset.
304 @see #startOffset() and #endOffset()*/
305 public void setOffset(int startOffset, int endOffset) {
306 this.startOffset = startOffset;
307 this.endOffset = endOffset;
310 /** Returns this Token's lexical type. Defaults to "word". */
311 public final String type() {
315 /** Set the lexical type.
317 public final void setType(String type) {
324 * Get the bitset for any bits that have been set. This is completely distinct from {@link #type()}, although they do share similar purposes.
325 * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
329 * @lucene.experimental While we think this is here to stay, we may want to change it to be a long.
331 public int getFlags() {
338 public void setFlags(int flags) {
343 * Returns this Token's payload.
345 public Payload getPayload() {
350 * Sets this Token's payload.
352 public void setPayload(Payload payload) {
353 this.payload = payload;
356 /** Resets the term text, payload, flags, and positionIncrement,
357 * startOffset, endOffset and token type to default.
360 public void clear() {
363 positionIncrement = 1;
365 startOffset = endOffset = 0;
370 public Object clone() {
371 Token t = (Token)super.clone();
373 if (payload != null) {
374 t.payload = (Payload) payload.clone();
379 /** Makes a clone, but replaces the term buffer &
380 * start/end offset in the process. This is more
381 * efficient than doing a full clone (and then calling
382 * {@link #copyBuffer}) because it saves a wasted copy of the old
384 public Token clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
385 final Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset);
386 t.positionIncrement = positionIncrement;
390 t.payload = (Payload) payload.clone();
395 public boolean equals(Object obj) {
399 if (obj instanceof Token) {
400 final Token other = (Token) obj;
401 return (startOffset == other.startOffset &&
402 endOffset == other.endOffset &&
403 flags == other.flags &&
404 positionIncrement == other.positionIncrement &&
405 (type == null ? other.type == null : type.equals(other.type)) &&
406 (payload == null ? other.payload == null : payload.equals(other.payload)) &&
414 public int hashCode() {
415 int code = super.hashCode();
416 code = code * 31 + startOffset;
417 code = code * 31 + endOffset;
418 code = code * 31 + flags;
419 code = code * 31 + positionIncrement;
421 code = code * 31 + type.hashCode();
423 code = code * 31 + payload.hashCode();
427 // like clear() but doesn't clear termBuffer/text
428 private void clearNoTermBuffer() {
430 positionIncrement = 1;
432 startOffset = endOffset = 0;
436 /** Shorthand for calling {@link #clear},
437 * {@link #copyBuffer(char[], int, int)},
438 * {@link #setStartOffset},
439 * {@link #setEndOffset},
441 * @return this Token instance */
442 public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
444 copyBuffer(newTermBuffer, newTermOffset, newTermLength);
446 positionIncrement = 1;
447 startOffset = newStartOffset;
448 endOffset = newEndOffset;
453 /** Shorthand for calling {@link #clear},
454 * {@link #copyBuffer(char[], int, int)},
455 * {@link #setStartOffset},
456 * {@link #setEndOffset}
457 * {@link #setType} on Token.DEFAULT_TYPE
458 * @return this Token instance */
459 public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
461 copyBuffer(newTermBuffer, newTermOffset, newTermLength);
462 startOffset = newStartOffset;
463 endOffset = newEndOffset;
468 /** Shorthand for calling {@link #clear},
469 * {@link #append(CharSequence)},
470 * {@link #setStartOffset},
471 * {@link #setEndOffset}
473 * @return this Token instance */
474 public Token reinit(String newTerm, int newStartOffset, int newEndOffset, String newType) {
477 startOffset = newStartOffset;
478 endOffset = newEndOffset;
483 /** Shorthand for calling {@link #clear},
484 * {@link #append(CharSequence, int, int)},
485 * {@link #setStartOffset},
486 * {@link #setEndOffset}
488 * @return this Token instance */
489 public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
491 append(newTerm, newTermOffset, newTermOffset + newTermLength);
492 startOffset = newStartOffset;
493 endOffset = newEndOffset;
498 /** Shorthand for calling {@link #clear},
499 * {@link #append(CharSequence)},
500 * {@link #setStartOffset},
501 * {@link #setEndOffset}
502 * {@link #setType} on Token.DEFAULT_TYPE
503 * @return this Token instance */
504 public Token reinit(String newTerm, int newStartOffset, int newEndOffset) {
507 startOffset = newStartOffset;
508 endOffset = newEndOffset;
513 /** Shorthand for calling {@link #clear},
514 * {@link #append(CharSequence, int, int)},
515 * {@link #setStartOffset},
516 * {@link #setEndOffset}
517 * {@link #setType} on Token.DEFAULT_TYPE
518 * @return this Token instance */
519 public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
521 append(newTerm, newTermOffset, newTermOffset + newTermLength);
522 startOffset = newStartOffset;
523 endOffset = newEndOffset;
529 * Copy the prototype token's fields into this one. Note: Payloads are shared.
532 public void reinit(Token prototype) {
533 copyBuffer(prototype.buffer(), 0, prototype.length());
534 positionIncrement = prototype.positionIncrement;
535 flags = prototype.flags;
536 startOffset = prototype.startOffset;
537 endOffset = prototype.endOffset;
538 type = prototype.type;
539 payload = prototype.payload;
543 * Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
547 public void reinit(Token prototype, String newTerm) {
548 setEmpty().append(newTerm);
549 positionIncrement = prototype.positionIncrement;
550 flags = prototype.flags;
551 startOffset = prototype.startOffset;
552 endOffset = prototype.endOffset;
553 type = prototype.type;
554 payload = prototype.payload;
558 * Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
560 * @param newTermBuffer
564 public void reinit(Token prototype, char[] newTermBuffer, int offset, int length) {
565 copyBuffer(newTermBuffer, offset, length);
566 positionIncrement = prototype.positionIncrement;
567 flags = prototype.flags;
568 startOffset = prototype.startOffset;
569 endOffset = prototype.endOffset;
570 type = prototype.type;
571 payload = prototype.payload;
575 public void copyTo(AttributeImpl target) {
576 if (target instanceof Token) {
577 final Token to = (Token) target;
579 // reinit shares the payload, so clone it:
580 if (payload !=null) {
581 to.payload = (Payload) payload.clone();
584 super.copyTo(target);
585 ((OffsetAttribute) target).setOffset(startOffset, endOffset);
586 ((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
587 ((PayloadAttribute) target).setPayload((payload == null) ? null : (Payload) payload.clone());
588 ((FlagsAttribute) target).setFlags(flags);
589 ((TypeAttribute) target).setType(type);
594 public void reflectWith(AttributeReflector reflector) {
595 super.reflectWith(reflector);
596 reflector.reflect(OffsetAttribute.class, "startOffset", startOffset);
597 reflector.reflect(OffsetAttribute.class, "endOffset", endOffset);
598 reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement);
599 reflector.reflect(PayloadAttribute.class, "payload", payload);
600 reflector.reflect(FlagsAttribute.class, "flags", flags);
601 reflector.reflect(TypeAttribute.class, "type", type);
/** Convenience factory that returns <code>Token</code> as implementation for the basic
 * attributes and return the default impl (with "Impl" appended) for all other
 * attributes.
 */
public static final AttributeSource.AttributeFactory TOKEN_ATTRIBUTE_FACTORY =
new TokenAttributeFactory(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
612 /** <b>Expert:</b> Creates a TokenAttributeFactory returning {@link Token} as instance for the basic attributes
613 * and for all other attributes calls the given delegate factory.
616 public static final class TokenAttributeFactory extends AttributeSource.AttributeFactory {
618 private final AttributeSource.AttributeFactory delegate;
620 /** <b>Expert</b>: Creates an AttributeFactory returning {@link Token} as instance for the basic attributes
621 * and for all other attributes calls the given delegate factory. */
622 public TokenAttributeFactory(AttributeSource.AttributeFactory delegate) {
623 this.delegate = delegate;
627 public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
628 return attClass.isAssignableFrom(Token.class)
629 ? new Token() : delegate.createAttributeInstance(attClass);
633 public boolean equals(Object other) {
634 if (this == other) return true;
635 if (other instanceof TokenAttributeFactory) {
636 final TokenAttributeFactory af = (TokenAttributeFactory) other;
637 return this.delegate.equals(af.delegate);
643 public int hashCode() {
644 return delegate.hashCode() ^ 0x0a45aa31;