lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/CharTokenizer.java

   1 package org.apache.lucene.analysis;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.Reader;
  22
  23 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  24 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  25 import org.apache.lucene.util.AttributeSource;
  26 import org.apache.lucene.util.CharacterUtils;
  27 import org.apache.lucene.util.Version;
  28 import org.apache.lucene.util.VirtualMethod;
  29 import org.apache.lucene.util.CharacterUtils.CharacterBuffer;
  30
  31 /**
  32  * An abstract base class for simple, character-oriented tokenizers.
  33  * <p>
  34  * <a name="version">You must specify the required {@link Version} compatibility
  35  * when creating {@link CharTokenizer}:
  36  * <ul>
  37  * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
  38  * detect token codepoints. See {@link #isTokenChar(int)} and
  39  * {@link #normalize(int)} for details.</li>
  40  * </ul>
  41  * <p>
  42  * A new {@link CharTokenizer} API has been introduced with Lucene 3.1. This API
  43  * moved from UTF-16 code units to UTF-32 codepoints to eventually add support
  44  * for <a href=
  45  * "http://java.sun.com/j2se/1.5.0/docs/api/java/lang/Character.html#supplementary"
  46  * >supplementary characters</a>. The old <i>char</i> based API has been
  47  * deprecated and should be replaced with the <i>int</i> based methods
  48  * {@link #isTokenChar(int)} and {@link #normalize(int)}.
  49  * </p>
  50  * <p>
  51  * As of Lucene 3.1 each {@link CharTokenizer} - constructor expects a
  52  * {@link Version} argument. Based on the given {@link Version} either the new
  53  * API or a backwards compatibility layer is used at runtime. For
  54  * {@link Version} < 3.1 the backwards compatibility layer ensures correct
  55  * behavior even for indexes build with previous versions of Lucene. If a
  56  * {@link Version} >= 3.1 is used {@link CharTokenizer} requires the new API to
  57  * be implemented by the instantiated class. Yet, the old <i>char</i> based API
  58  * is not required anymore even if backwards compatibility must be preserved.
  59  * {@link CharTokenizer} subclasses implementing the new API are fully backwards
  60  * compatible if instantiated with {@link Version} < 3.1.
  61  * </p>
  62  * <p>
  63  * <strong>Note:</strong> If you use a subclass of {@link CharTokenizer} with {@link Version} >=
  64  * 3.1 on an index build with a version < 3.1, created tokens might not be
  65  * compatible with the terms in your index.
  66  * </p>
  67  **/
  68 public abstract class CharTokenizer extends Tokenizer {
  69
  70   /**
  71    * Creates a new {@link CharTokenizer} instance
  72    *
  73    * @param matchVersion
  74    *          Lucene version to match See {@link <a href="#version">above</a>}
  75    * @param input
  76    *          the input to split up into tokens
  77    */
  78   public CharTokenizer(Version matchVersion, Reader input) {
  79     super(input);
  80     charUtils = CharacterUtils.getInstance(matchVersion);
  81     useOldAPI = useOldAPI(matchVersion);
  82
  83   }
  84
  85   /**
  86    * Creates a new {@link CharTokenizer} instance
  87    *
  88    * @param matchVersion
  89    *          Lucene version to match See {@link <a href="#version">above</a>}
  90    * @param source
  91    *          the attribute source to use for this {@link Tokenizer}
  92    * @param input
  93    *          the input to split up into tokens
  94    */
  95   public CharTokenizer(Version matchVersion, AttributeSource source,
  96       Reader input) {
  97     super(source, input);
  98     charUtils = CharacterUtils.getInstance(matchVersion);
  99     useOldAPI = useOldAPI(matchVersion);
 100   }
 101
 102   /**
 103    * Creates a new {@link CharTokenizer} instance
 104    *
 105    * @param matchVersion
 106    *          Lucene version to match See {@link <a href="#version">above</a>}
 107    * @param factory
 108    *          the attribute factory to use for this {@link Tokenizer}
 109    * @param input
 110    *          the input to split up into tokens
 111    */
 112   public CharTokenizer(Version matchVersion, AttributeFactory factory,
 113       Reader input) {
 114     super(factory, input);
 115     charUtils = CharacterUtils.getInstance(matchVersion);
 116     useOldAPI = useOldAPI(matchVersion);
 117   }
 118
 119   /**
 120    * Creates a new {@link CharTokenizer} instance
 121    * @param input the input to split up into tokens
 122    * @deprecated use {@link #CharTokenizer(Version, Reader)} instead. This will be
 123    *             removed in Lucene 4.0.
 124    */
 125   @Deprecated
 126   public CharTokenizer(Reader input) {
 127     this(Version.LUCENE_30, input);
 128   }
 129
 130   /**
 131    * Creates a new {@link CharTokenizer} instance
 132    * @param input the input to split up into tokens
 133    * @param source the attribute source to use for this {@link Tokenizer}
 134    * @deprecated use {@link #CharTokenizer(Version, AttributeSource, Reader)} instead. This will be
 135    *             removed in Lucene 4.0.
 136    */
 137   @Deprecated
 138   public CharTokenizer(AttributeSource source, Reader input) {
 139     this(Version.LUCENE_30, source, input);
 140   }
 141
 142   /**
 143    * Creates a new {@link CharTokenizer} instance
 144    * @param input the input to split up into tokens
 145    * @param factory the attribute factory to use for this {@link Tokenizer}
 146    * @deprecated use {@link #CharTokenizer(Version, AttributeSource.AttributeFactory, Reader)} instead. This will be
 147    *             removed in Lucene 4.0.
 148    */
 149   @Deprecated
 150   public CharTokenizer(AttributeFactory factory, Reader input) {
 151     this(Version.LUCENE_30, factory, input);
 152   }
 153
 154   private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
 155   private static final int MAX_WORD_LEN = 255;
 156   private static final int IO_BUFFER_SIZE = 4096;
 157
 158   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);;
 159   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 160
 161   private final CharacterUtils charUtils;
 162   private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
 163
 164   /**
 165    * @deprecated this will be removed in lucene 4.0
 166    */
 167   @Deprecated
 168   private final boolean useOldAPI;
 169
 170   /**
 171    * @deprecated this will be removed in lucene 4.0
 172    */
 173   @Deprecated
 174   private static final VirtualMethod<CharTokenizer> isTokenCharMethod =
 175     new VirtualMethod<CharTokenizer>(CharTokenizer.class, "isTokenChar", char.class);
 176
 177   /**
 178    * @deprecated this will be removed in lucene 4.0
 179    */
 180   @Deprecated
 181   private static final VirtualMethod<CharTokenizer> normalizeMethod =
 182     new VirtualMethod<CharTokenizer>(CharTokenizer.class, "normalize", char.class);
 183
 184   /**
 185    * Returns true iff a UTF-16 code unit should be included in a token. This
 186    * tokenizer generates as tokens adjacent sequences of characters which
 187    * satisfy this predicate. Characters for which this is <code>false</code> are
 188    * used to define token boundaries and are not included in tokens.
 189    * <p>
 190    * Note: This method cannot handle <a href=
 191    * "http://java.sun.com/j2se/1.5.0/docs/api/java/lang/Character.html#supplementary"
 192    * >supplementary characters</a>. To support all Unicode characters, including
 193    * supplementary characters, use the {@link #isTokenChar(int)} method.
 194    * </p>
 195    *
 196    * @deprecated use {@link #isTokenChar(int)} instead. This method will be
 197    *             removed in Lucene 4.0.
 198    */
 199   @Deprecated
 200   protected boolean isTokenChar(char c) {
 201     return isTokenChar((int)c);
 202   }
 203
 204   /**
 205    * Called on each token UTF-16 code unit to normalize it before it is added to the
 206    * token. The default implementation does nothing. Subclasses may use this to,
 207    * e.g., lowercase tokens.
 208    * <p>
 209    * Note: This method cannot handle <a href=
 210    * "http://java.sun.com/j2se/1.5.0/docs/api/java/lang/Character.html#supplementary"
 211    * >supplementary characters</a>. To support all Unicode characters, including
 212    * supplementary characters, use the {@link #normalize(int)} method.
 213    * </p>
 214    *
 215    * @deprecated use {@link #normalize(int)} instead. This method will be
 216    *             removed in Lucene 4.0.
 217    */
 218   @Deprecated
 219   protected char normalize(char c) {
 220     return (char) normalize((int) c);
 221   }
 222
 223   /**
 224    * Returns true iff a codepoint should be included in a token. This tokenizer
 225    * generates as tokens adjacent sequences of codepoints which satisfy this
 226    * predicate. Codepoints for which this is false are used to define token
 227    * boundaries and are not included in tokens.
 228    * <p>
 229    * As of Lucene 3.1 the char based API ({@link #isTokenChar(char)} and
 230    * {@link #normalize(char)}) has been depreciated in favor of a Unicode 4.0
 231    * compatible int based API to support codepoints instead of UTF-16 code
 232    * units. Subclasses of {@link CharTokenizer} must not override the char based
 233    * methods if a {@link Version} >= 3.1 is passed to the constructor.
 234    * <p>
 235    * <p>
 236    * NOTE: This method will be marked <i>abstract</i> in Lucene 4.0.
 237    * </p>
 238    */
 239   protected boolean isTokenChar(int c) {
 240     throw new UnsupportedOperationException("since LUCENE_31 subclasses of CharTokenizer must implement isTokenChar(int)");
 241   }
 242
 243   /**
 244    * Called on each token character to normalize it before it is added to the
 245    * token. The default implementation does nothing. Subclasses may use this to,
 246    * e.g., lowercase tokens.
 247    * <p>
 248    * As of Lucene 3.1 the char based API ({@link #isTokenChar(char)} and
 249    * {@link #normalize(char)}) has been depreciated in favor of a Unicode 4.0
 250    * compatible int based API to support codepoints instead of UTF-16 code
 251    * units. Subclasses of {@link CharTokenizer} must not override the char based
 252    * methods if a {@link Version} >= 3.1 is passed to the constructor.
 253    * <p>
 254    * <p>
 255    * NOTE: This method will be marked <i>abstract</i> in Lucene 4.0.
 256    * </p>
 257    */
 258   protected int normalize(int c) {
 259     return c;
 260   }
 261
 262   @Override
 263   public final boolean incrementToken() throws IOException {
 264     clearAttributes();
 265     if(useOldAPI) // TODO remove this in LUCENE 4.0
 266       return incrementTokenOld();
 267     int length = 0;
 268     int start = -1; // this variable is always initialized
 269     char[] buffer = termAtt.buffer();
 270     while (true) {
 271       if (bufferIndex >= dataLen) {
 272         offset += dataLen;
 273         if(!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils
 274           dataLen = 0; // so next offset += dataLen won't decrement offset
 275           if (length > 0) {
 276             break;
 277           } else {
 278             finalOffset = correctOffset(offset);
 279             return false;
 280           }
 281         }
 282         dataLen = ioBuffer.getLength();
 283         bufferIndex = 0;
 284       }
 285       // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
 286       final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex);
 287       bufferIndex += Character.charCount(c);
 288
 289       if (isTokenChar(c)) {               // if it's a token char
 290         if (length == 0) {                // start of token
 291           assert start == -1;
 292           start = offset + bufferIndex - 1;
 293         } else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds
 294           buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer
 295         }
 296         length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
 297         if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test
 298           break;
 299       } else if (length > 0)             // at non-Letter w/ chars
 300         break;                           // return 'em
 301     }
 302
 303     termAtt.setLength(length);
 304     assert start != -1;
 305     offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start+length));
 306     return true;
 307
 308   }
 309
 310   /**
 311    * The <= 3.0 version of incrementToken. This is a backwards compat implementation used
 312    * if a version <= 3.0 is provided to the ctor.
 313    * @deprecated remove in 4.0
 314    */
 315   @Deprecated
 316   private boolean incrementTokenOld() throws IOException {
 317     int length = 0;
 318     int start = -1; // this variable is always initialized
 319     char[] buffer = termAtt.buffer();
 320     final char[] oldIoBuffer = ioBuffer.getBuffer();
 321     while (true) {
 322
 323       if (bufferIndex >= dataLen) {
 324         offset += dataLen;
 325         dataLen = input.read(oldIoBuffer);
 326         if (dataLen == -1) {
 327           dataLen = 0;                            // so next offset += dataLen won't decrement offset
 328           if (length > 0) {
 329             break;
 330           } else {
 331             finalOffset = correctOffset(offset);
 332             return false;
 333           }
 334         }
 335         bufferIndex = 0;
 336       }
 337
 338       final char c = oldIoBuffer[bufferIndex++];
 339
 340       if (isTokenChar(c)) {               // if it's a token char
 341
 342         if (length == 0) {                // start of token
 343           assert start == -1;
 344           start = offset + bufferIndex - 1;
 345         } else if (length == buffer.length) {
 346           buffer = termAtt.resizeBuffer(1+length);
 347         }
 348
 349         buffer[length++] = normalize(c); // buffer it, normalized
 350
 351         if (length == MAX_WORD_LEN)      // buffer overflow!
 352           break;
 353
 354       } else if (length > 0)             // at non-Letter w/ chars
 355         break;                           // return 'em
 356     }
 357
 358     termAtt.setLength(length);
 359     assert start != -1;
 360     offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
 361     return true;
 362   }
 363
 364
 365
 366   @Override
 367   public final void end() {
 368     // set final offset
 369     offsetAtt.setOffset(finalOffset, finalOffset);
 370   }
 371
 372   @Override
 373   public void reset(Reader input) throws IOException {
 374     super.reset(input);
 375     bufferIndex = 0;
 376     offset = 0;
 377     dataLen = 0;
 378     finalOffset = 0;
 379     ioBuffer.reset(); // make sure to reset the IO buffer!!
 380   }
 381
 382   /**
 383    * @deprecated this will be removed in lucene 4.0
 384    */
 385   @Deprecated
 386   private boolean useOldAPI(Version matchVersion) {
 387     final Class<? extends CharTokenizer> clazz = this.getClass();
 388     if (matchVersion.onOrAfter(Version.LUCENE_31)
 389         && (isTokenCharMethod.isOverriddenAsOf(clazz) || normalizeMethod
 390             .isOverriddenAsOf(clazz))) throw new IllegalArgumentException(
 391         "For matchVersion >= LUCENE_31, CharTokenizer subclasses must not override isTokenChar(char) or normalize(char).");
 392     return !matchVersion.onOrAfter(Version.LUCENE_31);
 393   }
 394 }