X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/analysis/TokenStream.java diff --git a/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/analysis/TokenStream.java b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/analysis/TokenStream.java new file mode 100644 index 0000000..d03d9ab --- /dev/null +++ b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/analysis/TokenStream.java @@ -0,0 +1,184 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Closeable; +import java.lang.reflect.Modifier; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.AttributeSource; + +/** + * A TokenStream enumerates the sequence of tokens, either from + * {@link Field}s of a {@link Document} or from query text. + *

+ * This is an abstract class; concrete subclasses are: + *

+ * A new TokenStream API has been introduced with Lucene 2.9. This API + * has moved from being {@link Token}-based to {@link Attribute}-based. While + * {@link Token} still exists in 2.9 as a convenience class, the preferred way + * to store the information of a {@link Token} is to use {@link AttributeImpl}s. + *

+ * TokenStream now extends {@link AttributeSource}, which provides + * access to all of the token {@link Attribute}s for the TokenStream. + * Note that only one instance per {@link AttributeImpl} is created and reused + * for every token. This approach reduces object creation and allows local + * caching of references to the {@link AttributeImpl}s. See + * {@link #incrementToken()} for further details. + *

+ * The workflow of the new TokenStream API is as follows: + *

    + *
  1. Instantiation of TokenStream/{@link TokenFilter}s which add/get + * attributes to/from the {@link AttributeSource}. + *
  2. The consumer calls {@link TokenStream#reset()}. + *
  3. The consumer retrieves attributes from the stream and stores local + * references to all attributes it wants to access. + *
  4. The consumer calls {@link #incrementToken()} until it returns false + * consuming the attributes after each call. + *
  5. The consumer calls {@link #end()} so that any end-of-stream operations + * can be performed. + *
  6. The consumer calls {@link #close()} to release any resource when finished + * using the TokenStream. + *
+ * To make sure that filters and consumers know which attributes are available, + * the attributes must be added during instantiation. Filters and consumers are + * not required to check for availability of attributes in + * {@link #incrementToken()}. + *

+ * You can find some example code for the new API in the analysis package level + * Javadoc. + *

+ * Sometimes it is desirable to capture a current state of a TokenStream, + * e.g., for buffering purposes (see {@link CachingTokenFilter}, + * {@link TeeSinkTokenFilter}). For this usecase + * {@link AttributeSource#captureState} and {@link AttributeSource#restoreState} + * can be used. + *

The {@code TokenStream}-API in Lucene is based on the decorator pattern. + * Therefore all non-abstract subclasses must be final or have at least a final + * implementation of {@link #incrementToken}! This is checked when Java + * assertions are enabled. + */ +public abstract class TokenStream extends AttributeSource implements Closeable { + + /** + * A TokenStream using the default attribute factory. + */ + protected TokenStream() { + super(); + assert assertFinal(); + } + + /** + * A TokenStream that uses the same attributes as the supplied one. + */ + protected TokenStream(AttributeSource input) { + super(input); + assert assertFinal(); + } + + /** + * A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances. + */ + protected TokenStream(AttributeFactory factory) { + super(factory); + assert assertFinal(); + } + + private boolean assertFinal() { + try { + final Class clazz = getClass(); + if (!clazz.desiredAssertionStatus()) + return true; + assert clazz.isAnonymousClass() || + (clazz.getModifiers() & (Modifier.FINAL | Modifier.PRIVATE)) != 0 || + Modifier.isFinal(clazz.getMethod("incrementToken").getModifiers()) : + "TokenStream implementation classes or at least their incrementToken() implementation must be final"; + return true; + } catch (NoSuchMethodException nsme) { + return false; + } + } + + /** + * Consumers (i.e., {@link IndexWriter}) use this method to advance the stream to + * the next token. Implementing classes must implement this method and update + * the appropriate {@link AttributeImpl}s with the attributes of the next + * token. + *

+ * The producer must make no assumptions about the attributes after the method + * has been returned: the caller may arbitrarily change it. If the producer + * needs to preserve the state for subsequent calls, it can use + * {@link #captureState} to create a copy of the current attribute state. + *

+ * This method is called for every token of a document, so an efficient + * implementation is crucial for good performance. To avoid calls to + * {@link #addAttribute(Class)} and {@link #getAttribute(Class)}, + * references to all {@link AttributeImpl}s that this stream uses should be + * retrieved during instantiation. + *

+ * To ensure that filters and consumers know which attributes are available, + * the attributes must be added during instantiation. Filters and consumers + * are not required to check for availability of attributes in + * {@link #incrementToken()}. + * + * @return false for end of stream; true otherwise + */ + public abstract boolean incrementToken() throws IOException; + + /** + * This method is called by the consumer after the last token has been + * consumed, after {@link #incrementToken()} returned false + * (using the new TokenStream API). Streams implementing the old API + * should upgrade to use this feature. + *

+ * This method can be used to perform any end-of-stream operations, such as + * setting the final offset of a stream. The final offset of a stream might + * differ from the offset of the last token eg in case one or more whitespaces + * followed after the last token, but a {@link WhitespaceTokenizer} was used. + * + * @throws IOException + */ + public void end() throws IOException { + // do nothing by default + } + + /** + * Resets this stream to the beginning. This is an optional operation, so + * subclasses may or may not implement this method. {@link #reset()} is not needed for + * the standard indexing process. However, if the tokens of a + * TokenStream are intended to be consumed more than once, it is + * necessary to implement {@link #reset()}. Note that if your TokenStream + * caches tokens and feeds them back again after a reset, it is imperative + * that you clone the tokens when you store them away (on the first pass) as + * well as when you return them (on future passes after {@link #reset()}). + */ + public void reset() throws IOException {} + + /** Releases resources associated with this stream. */ + public void close() throws IOException {} + +}