lucene-java-3.4.0/lucene/backwards/src/test-framework/org/apache/lucene/analysis/MockTokenizer.java

   1 package org.apache.lucene.analysis;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.Reader;
  22
  23 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  24 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  25 import org.apache.lucene.util.AttributeSource.AttributeFactory;
  26
  27 /**
  28  * Tokenizer for testing.
  29  * <p>
  30  * This tokenizer is a replacement for {@link #WHITESPACE}, {@link #SIMPLE}, and {@link #KEYWORD}
  31  * tokenizers. If you are writing a component such as a TokenFilter, its a great idea to test
  32  * it wrapping this tokenizer instead for extra checks. This tokenizer has the following behavior:
  33  * <ul>
  34  *   <li>An internal state-machine is used for checking consumer consistency. These checks can
  35  *       be disabled with {@link #setEnableChecks(boolean)}.
  36  *   <li>For convenience, optionally lowercases terms that it outputs.
  37  * </ul>
  38  */
  39 public class MockTokenizer extends Tokenizer {
  40   /** Acts Similar to WhitespaceTokenizer */
  41   public static final int WHITESPACE = 0;
  42   /** Acts Similar to KeywordTokenizer.
  43    * TODO: Keyword returns an "empty" token for an empty reader...
  44    */
  45   public static final int KEYWORD = 1;
  46   /** Acts like LetterTokenizer. */
  47   public static final int SIMPLE = 2;
  48
  49   private final int pattern;
  50   private final boolean lowerCase;
  51   private final int maxTokenLength;
  52   public static final int DEFAULT_MAX_TOKEN_LENGTH = Integer.MAX_VALUE;
  53
  54   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  55   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  56   int off = 0;
  57
  58   // TODO: "register" with LuceneTestCase to ensure all streams are closed() ?
  59   // currently, we can only check that the lifecycle is correct if someone is reusing,
  60   // but not for "one-offs".
  61   private static enum State {
  62     SETREADER,       // consumer set a reader input either via ctor or via reset(Reader)
  63     RESET,           // consumer has called reset()
  64     INCREMENT,       // consumer is consuming, has called incrementToken() == true
  65     INCREMENT_FALSE, // consumer has called incrementToken() which returned false
  66     END,             // consumer has called end() to perform end of stream operations
  67     CLOSE            // consumer has called close() to release any resources
  68   };
  69
  70   private State streamState = State.CLOSE;
  71   private boolean enableChecks = true;
  72
  73   public MockTokenizer(AttributeFactory factory, Reader input, int pattern, boolean lowerCase, int maxTokenLength) {
  74     super(factory, input);
  75     this.pattern = pattern;
  76     this.lowerCase = lowerCase;
  77     this.streamState = State.SETREADER;
  78     this.maxTokenLength = maxTokenLength;
  79   }
  80
  81   public MockTokenizer(Reader input, int pattern, boolean lowerCase, int maxTokenLength) {
  82     this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, pattern, lowerCase, maxTokenLength);
  83   }
  84
  85   public MockTokenizer(Reader input, int pattern, boolean lowerCase) {
  86     this(input, pattern, lowerCase, DEFAULT_MAX_TOKEN_LENGTH);
  87   }
  88
  89   @Override
  90   public final boolean incrementToken() throws IOException {
  91     assert !enableChecks || (streamState == State.RESET || streamState == State.INCREMENT)
  92                             : "incrementToken() called while in wrong state: " + streamState;
  93     clearAttributes();
  94     for (;;) {
  95       int startOffset = off;
  96       int cp = readCodePoint();
  97       if (cp < 0) {
  98         break;
  99       } else if (isTokenChar(cp)) {
 100         int endOffset;
 101         do {
 102           char chars[] = Character.toChars(normalize(cp));
 103           for (int i = 0; i < chars.length; i++)
 104             termAtt.append(chars[i]);
 105           endOffset = off;
 106           if (termAtt.length() >= maxTokenLength) {
 107             break;
 108           }
 109           cp = readCodePoint();
 110         } while (cp >= 0 && isTokenChar(cp));
 111         offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
 112         streamState = State.INCREMENT;
 113         return true;
 114       }
 115     }
 116     streamState = State.INCREMENT_FALSE;
 117     return false;
 118   }
 119
 120   protected int readCodePoint() throws IOException {
 121     int ch = input.read();
 122     if (ch < 0) {
 123       return ch;
 124     } else {
 125       assert ch != 0xffff; /* only on 3.x */
 126       assert !Character.isLowSurrogate((char) ch);
 127       off++;
 128       if (Character.isHighSurrogate((char) ch)) {
 129         int ch2 = input.read();
 130         if (ch2 >= 0) {
 131           off++;
 132           assert Character.isLowSurrogate((char) ch2);
 133           return Character.toCodePoint((char) ch, (char) ch2);
 134         }
 135       }
 136       return ch;
 137     }
 138   }
 139
 140   protected boolean isTokenChar(int c) {
 141     switch(pattern) {
 142       case WHITESPACE: return !Character.isWhitespace(c);
 143       case KEYWORD: return true;
 144       case SIMPLE: return Character.isLetter(c);
 145       default: throw new RuntimeException("invalid pattern constant:" + pattern);
 146     }
 147   }
 148
 149   protected int normalize(int c) {
 150     return lowerCase ? Character.toLowerCase(c) : c;
 151   }
 152
 153   @Override
 154   public void reset() throws IOException {
 155     super.reset();
 156     off = 0;
 157     assert !enableChecks || streamState != State.RESET : "double reset()";
 158     streamState = State.RESET;
 159   }
 160
 161   @Override
 162   public void close() throws IOException {
 163     super.close();
 164     // in some exceptional cases (e.g. TestIndexWriterExceptions) a test can prematurely close()
 165     // these tests should disable this check, by default we check the normal workflow.
 166     // TODO: investigate the CachingTokenFilter "double-close"... for now we ignore this
 167     assert !enableChecks || streamState == State.END || streamState == State.CLOSE : "close() called in wrong state: " + streamState;
 168     streamState = State.CLOSE;
 169   }
 170
 171   @Override
 172   public void reset(Reader input) throws IOException {
 173     super.reset(input);
 174     assert !enableChecks || streamState == State.CLOSE : "setReader() called in wrong state: " + streamState;
 175     streamState = State.SETREADER;
 176   }
 177
 178   @Override
 179   public void end() throws IOException {
 180     int finalOffset = correctOffset(off);
 181     offsetAtt.setOffset(finalOffset, finalOffset);
 182     // some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false.
 183     // these tests should disable this check (in general you should consume the entire stream)
 184     assert !enableChecks || streamState == State.INCREMENT_FALSE : "end() called before incrementToken() returned false!";
 185     streamState = State.END;
 186   }
 187
 188   /**
 189    * Toggle consumer workflow checking: if your test consumes tokenstreams normally you
 190    * should leave this enabled.
 191    */
 192   public void setEnableChecks(boolean enableChecks) {
 193     this.enableChecks = enableChecks;
 194   }
 195 }