lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/TokenStream.java

   1 package org.apache.lucene.analysis;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.Closeable;
  22 import java.lang.reflect.Modifier;
  23
  24 import org.apache.lucene.document.Document;
  25 import org.apache.lucene.document.Field;
  26 import org.apache.lucene.index.IndexWriter;
  27 import org.apache.lucene.util.Attribute;
  28 import org.apache.lucene.util.AttributeImpl;
  29 import org.apache.lucene.util.AttributeSource;
  30
  31 /**
  32  * A <code>TokenStream</code> enumerates the sequence of tokens, either from
  33  * {@link Field}s of a {@link Document} or from query text.
  34  * <p>
  35  * This is an abstract class; concrete subclasses are:
  36  * <ul>
  37  * <li>{@link Tokenizer}, a <code>TokenStream</code> whose input is a Reader; and
  38  * <li>{@link TokenFilter}, a <code>TokenStream</code> whose input is another
  39  * <code>TokenStream</code>.
  40  * </ul>
  41  * A new <code>TokenStream</code> API has been introduced with Lucene 2.9. This API
  42  * has moved from being {@link Token}-based to {@link Attribute}-based. While
  43  * {@link Token} still exists in 2.9 as a convenience class, the preferred way
  44  * to store the information of a {@link Token} is to use {@link AttributeImpl}s.
  45  * <p>
  46  * <code>TokenStream</code> now extends {@link AttributeSource}, which provides
  47  * access to all of the token {@link Attribute}s for the <code>TokenStream</code>.
  48  * Note that only one instance per {@link AttributeImpl} is created and reused
  49  * for every token. This approach reduces object creation and allows local
  50  * caching of references to the {@link AttributeImpl}s. See
  51  * {@link #incrementToken()} for further details.
  52  * <p>
  53  * <b>The workflow of the new <code>TokenStream</code> API is as follows:</b>
  54  * <ol>
  55  * <li>Instantiation of <code>TokenStream</code>/{@link TokenFilter}s which add/get
  56  * attributes to/from the {@link AttributeSource}.
  57  * <li>The consumer calls {@link TokenStream#reset()}.
  58  * <li>The consumer retrieves attributes from the stream and stores local
  59  * references to all attributes it wants to access.
  60  * <li>The consumer calls {@link #incrementToken()} until it returns false
  61  * consuming the attributes after each call.
  62  * <li>The consumer calls {@link #end()} so that any end-of-stream operations
  63  * can be performed.
  64  * <li>The consumer calls {@link #close()} to release any resource when finished
  65  * using the <code>TokenStream</code>.
  66  * </ol>
  67  * To make sure that filters and consumers know which attributes are available,
  68  * the attributes must be added during instantiation. Filters and consumers are
  69  * not required to check for availability of attributes in
  70  * {@link #incrementToken()}.
  71  * <p>
  72  * You can find some example code for the new API in the analysis package level
  73  * Javadoc.
  74  * <p>
  75  * Sometimes it is desirable to capture a current state of a <code>TokenStream</code>,
  76  * e.g., for buffering purposes (see {@link CachingTokenFilter},
  77  * {@link TeeSinkTokenFilter}). For this usecase
  78  * {@link AttributeSource#captureState} and {@link AttributeSource#restoreState}
  79  * can be used.
  80  * <p>The {@code TokenStream}-API in Lucene is based on the decorator pattern.
  81  * Therefore all non-abstract subclasses must be final or have at least a final
  82  * implementation of {@link #incrementToken}! This is checked when Java
  83  * assertions are enabled.
  84  */
  85 public abstract class TokenStream extends AttributeSource implements Closeable {
  86
  87   /**
  88    * A TokenStream using the default attribute factory.
  89    */
  90   protected TokenStream() {
  91     super();
  92     assert assertFinal();
  93   }
  94
  95   /**
  96    * A TokenStream that uses the same attributes as the supplied one.
  97    */
  98   protected TokenStream(AttributeSource input) {
  99     super(input);
 100     assert assertFinal();
 101   }
 102
 103   /**
 104    * A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances.
 105    */
 106   protected TokenStream(AttributeFactory factory) {
 107     super(factory);
 108     assert assertFinal();
 109   }
 110
 111   private boolean assertFinal() {
 112     try {
 113       final Class<?> clazz = getClass();
 114       assert clazz.isAnonymousClass() ||
 115         (clazz.getModifiers() & (Modifier.FINAL | Modifier.PRIVATE)) != 0 ||
 116         Modifier.isFinal(clazz.getMethod("incrementToken").getModifiers()) :
 117         "TokenStream implementation classes or at least their incrementToken() implementation must be final";
 118       return true;
 119     } catch (NoSuchMethodException nsme) {
 120       return false;
 121     }
 122   }
 123
 124   /**
 125    * Consumers (i.e., {@link IndexWriter}) use this method to advance the stream to
 126    * the next token. Implementing classes must implement this method and update
 127    * the appropriate {@link AttributeImpl}s with the attributes of the next
 128    * token.
 129    * <P>
 130    * The producer must make no assumptions about the attributes after the method
 131    * has been returned: the caller may arbitrarily change it. If the producer
 132    * needs to preserve the state for subsequent calls, it can use
 133    * {@link #captureState} to create a copy of the current attribute state.
 134    * <p>
 135    * This method is called for every token of a document, so an efficient
 136    * implementation is crucial for good performance. To avoid calls to
 137    * {@link #addAttribute(Class)} and {@link #getAttribute(Class)},
 138    * references to all {@link AttributeImpl}s that this stream uses should be
 139    * retrieved during instantiation.
 140    * <p>
 141    * To ensure that filters and consumers know which attributes are available,
 142    * the attributes must be added during instantiation. Filters and consumers
 143    * are not required to check for availability of attributes in
 144    * {@link #incrementToken()}.
 145    *
 146    * @return false for end of stream; true otherwise
 147    */
 148   public abstract boolean incrementToken() throws IOException;
 149
 150   /**
 151    * This method is called by the consumer after the last token has been
 152    * consumed, after {@link #incrementToken()} returned <code>false</code>
 153    * (using the new <code>TokenStream</code> API). Streams implementing the old API
 154    * should upgrade to use this feature.
 155    * <p/>
 156    * This method can be used to perform any end-of-stream operations, such as
 157    * setting the final offset of a stream. The final offset of a stream might
 158    * differ from the offset of the last token eg in case one or more whitespaces
 159    * followed after the last token, but a {@link WhitespaceTokenizer} was used.
 160    *
 161    * @throws IOException
 162    */
 163   public void end() throws IOException {
 164     // do nothing by default
 165   }
 166
 167   /**
 168    * Resets this stream to the beginning. This is an optional operation, so
 169    * subclasses may or may not implement this method. {@link #reset()} is not needed for
 170    * the standard indexing process. However, if the tokens of a
 171    * <code>TokenStream</code> are intended to be consumed more than once, it is
 172    * necessary to implement {@link #reset()}. Note that if your TokenStream
 173    * caches tokens and feeds them back again after a reset, it is imperative
 174    * that you clone the tokens when you store them away (on the first pass) as
 175    * well as when you return them (on future passes after {@link #reset()}).
 176    */
 177   public void reset() throws IOException {}
 178
 179   /** Releases resources associated with this stream. */
 180   public void close() throws IOException {}
 181
 182 }