lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java

   1 package org.apache.lucene.analysis.ngram;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.analysis.Tokenizer;
  21 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  22 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  23 import org.apache.lucene.util.AttributeSource;
  24
  25 import java.io.IOException;
  26 import java.io.Reader;
  27
  28 /**
  29  * Tokenizes the input into n-grams of the given size(s).
  30  */
  31 public final class NGramTokenizer extends Tokenizer {
  32   public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
  33   public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
  34
  35   private int minGram, maxGram;
  36   private int gramSize;
  37   private int pos = 0;
  38   private int inLen;
  39   private String inStr;
  40   private boolean started = false;
  41
  42   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  43   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  44
  45   /**
  46    * Creates NGramTokenizer with given min and max n-grams.
  47    * @param input {@link Reader} holding the input to be tokenized
  48    * @param minGram the smallest n-gram to generate
  49    * @param maxGram the largest n-gram to generate
  50    */
  51   public NGramTokenizer(Reader input, int minGram, int maxGram) {
  52     super(input);
  53     init(minGram, maxGram);
  54   }
  55
  56   /**
  57    * Creates NGramTokenizer with given min and max n-grams.
  58    * @param source {@link AttributeSource} to use
  59    * @param input {@link Reader} holding the input to be tokenized
  60    * @param minGram the smallest n-gram to generate
  61    * @param maxGram the largest n-gram to generate
  62    */
  63   public NGramTokenizer(AttributeSource source, Reader input, int minGram, int maxGram) {
  64     super(source, input);
  65     init(minGram, maxGram);
  66   }
  67
  68   /**
  69    * Creates NGramTokenizer with given min and max n-grams.
  70    * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
  71    * @param input {@link Reader} holding the input to be tokenized
  72    * @param minGram the smallest n-gram to generate
  73    * @param maxGram the largest n-gram to generate
  74    */
  75   public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) {
  76     super(factory, input);
  77     init(minGram, maxGram);
  78   }
  79
  80   /**
  81    * Creates NGramTokenizer with default min and max n-grams.
  82    * @param input {@link Reader} holding the input to be tokenized
  83    */
  84   public NGramTokenizer(Reader input) {
  85     this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
  86   }
  87
  88   private void init(int minGram, int maxGram) {
  89     if (minGram < 1) {
  90       throw new IllegalArgumentException("minGram must be greater than zero");
  91     }
  92     if (minGram > maxGram) {
  93       throw new IllegalArgumentException("minGram must not be greater than maxGram");
  94     }
  95     this.minGram = minGram;
  96     this.maxGram = maxGram;
  97   }
  98
  99   /** Returns the next token in the stream, or null at EOS. */
 100   @Override
 101   public final boolean incrementToken() throws IOException {
 102     clearAttributes();
 103     if (!started) {
 104       started = true;
 105       gramSize = minGram;
 106       char[] chars = new char[1024];
 107       input.read(chars);
 108       inStr = new String(chars).trim();  // remove any trailing empty strings
 109       inLen = inStr.length();
 110     }
 111
 112     if (pos+gramSize > inLen) {            // if we hit the end of the string
 113       pos = 0;                           // reset to beginning of string
 114       gramSize++;                        // increase n-gram size
 115       if (gramSize > maxGram)            // we are done
 116         return false;
 117       if (pos+gramSize > inLen)
 118         return false;
 119     }
 120
 121     int oldPos = pos;
 122     pos++;
 123     termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize);
 124     offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
 125     return true;
 126   }
 127
 128   @Override
 129   public final void end() {
 130     // set final offset
 131     final int finalOffset = inLen;
 132     this.offsetAtt.setOffset(finalOffset, finalOffset);
 133   }
 134
 135   @Override
 136   public void reset(Reader input) throws IOException {
 137     super.reset(input);
 138     reset();
 139   }
 140
 141   @Override
 142   public void reset() throws IOException {
 143     super.reset();
 144     started = false;
 145     pos = 0;
 146   }
 147 }