lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/Tokenizer.java

   1 package org.apache.lucene.analysis;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.util.AttributeSource;
  21
  22 import java.io.Reader;
  23 import java.io.IOException;
  24
  25 /** A Tokenizer is a TokenStream whose input is a Reader.
  26   <p>
  27   This is an abstract class; subclasses must override {@link #incrementToken()}
  28   <p>
  29   NOTE: Subclasses overriding {@link #incrementToken()} must
  30   call {@link AttributeSource#clearAttributes()} before
  31   setting attributes.
  32  */
  33 public abstract class Tokenizer extends TokenStream {
  34   /** The text source for this Tokenizer. */
  35   protected Reader input;
  36
  37   /** Construct a tokenizer with null input. */
  38   protected Tokenizer() {}
  39
  40   /** Construct a token stream processing the given input. */
  41   protected Tokenizer(Reader input) {
  42     this.input = CharReader.get(input);
  43   }
  44
  45   /** Construct a tokenizer with null input using the given AttributeFactory. */
  46   protected Tokenizer(AttributeFactory factory) {
  47     super(factory);
  48   }
  49
  50   /** Construct a token stream processing the given input using the given AttributeFactory. */
  51   protected Tokenizer(AttributeFactory factory, Reader input) {
  52     super(factory);
  53     this.input = CharReader.get(input);
  54   }
  55
  56   /** Construct a token stream processing the given input using the given AttributeSource. */
  57   protected Tokenizer(AttributeSource source) {
  58     super(source);
  59   }
  60
  61   /** Construct a token stream processing the given input using the given AttributeSource. */
  62   protected Tokenizer(AttributeSource source, Reader input) {
  63     super(source);
  64     this.input = CharReader.get(input);
  65   }
  66
  67   /** By default, closes the input Reader. */
  68   @Override
  69   public void close() throws IOException {
  70     if (input != null) {
  71       input.close();
  72       // LUCENE-2387: don't hold onto Reader after close, so
  73       // GC can reclaim
  74       input = null;
  75     }
  76   }
  77
  78   /** Return the corrected offset. If {@link #input} is a {@link CharStream} subclass
  79    * this method calls {@link CharStream#correctOffset}, else returns <code>currentOff</code>.
  80    * @param currentOff offset as seen in the output
  81    * @return corrected offset based on the input
  82    * @see CharStream#correctOffset
  83    */
  84   protected final int correctOffset(int currentOff) {
  85     return (input instanceof CharStream) ? ((CharStream) input).correctOffset(currentOff) : currentOff;
  86   }
  87
  88   /** Expert: Reset the tokenizer to a new reader.  Typically, an
  89    *  analyzer (in its reusableTokenStream method) will use
  90    *  this to re-use a previously created tokenizer. */
  91   public void reset(Reader input) throws IOException {
  92     this.input = input;
  93   }
  94 }
  95