1 package org.apache.lucene.analysis.cjk;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.io.Reader;
23 import org.apache.lucene.analysis.Tokenizer;
24 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
25 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
26 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
27 import org.apache.lucene.util.AttributeSource;
30 * CJKTokenizer is designed for Chinese, Japanese, and Korean languages.
32 * The tokens returned are every two adjacent characters with overlap match.
35 * Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4".
37 * Additionally, the following is applied to Latin text (such as English):
39 * <li>Text is converted to lowercase.
40 * <li>Numeric digits, '+', '#', and '_' are tokenized as letters.
41 * <li>Full-width forms are converted to half-width forms.
43 * For more info on Asian language (Chinese, Japanese, and Korean) text segmentation:
45 * href="http://www.google.com/search?q=word+chinese+segment">google</a>
48 public final class CJKTokenizer extends Tokenizer {
  //~ Static fields/initializers ---------------------------------------------

  /** Word token type (the default/initial state of {@code tokenType}). */
  static final int WORD_TYPE = 0;

  /** Single byte token type (runs of ASCII/Latin characters). */
  static final int SINGLE_TOKEN_TYPE = 1;

  /** Double byte token type (overlapping non-ASCII character bigrams). */
  static final int DOUBLE_TOKEN_TYPE = 2;

  /** Names for token types, indexed by the *_TYPE constants above. */
  static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };

  /** Max word length (capacity of the token assembly buffer). */
  private static final int MAX_WORD_LEN = 255;

  /** Size of the chunk read from the input Reader at a time. */
  private static final int IO_BUFFER_SIZE = 256;

  //~ Instance fields --------------------------------------------------------

  /** word offset: position (in characters) within the overall input of the character being parsed */
  private int offset = 0;

  /** the index used only for ioBuffer */
  private int bufferIndex = 0;

  /** number of valid characters currently held in ioBuffer; -1 once the Reader reports end of stream */
  private int dataLen = 0;

  /**
   * character buffer: stores the characters which are used to compose the
   * token currently being assembled
   */
  private final char[] buffer = new char[MAX_WORD_LEN];

  /**
   * I/O buffer, used to store the content of the input (one of the
   * members of Tokenizer)
   */
  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

  /** word type: single=>ASCII  double=>non-ASCII  word=>default */
  private int tokenType = WORD_TYPE;

  /**
   * Flag: the previous character belongs to a cached double-byte bigram.
   * For "C1C2C3C4": after C1C2 is emitted, C2 is marked as already tokened
   * so the overlap continues with C2C3, then C3C4 => "C1C2 C2C3 C3C4".
   */
  private boolean preIsTokened = false;

  /** term text of the current token */
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  /** start/end character offsets of the current token */
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  /** token type name, one of TOKEN_TYPE_NAMES */
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  //~ Constructors -----------------------------------------------------------

  /**
   * Construct a token stream processing the given input.
   *
   * @param in I/O reader
   */
  // NOTE(review): the constructor bodies and closing braces are elided in
  // this excerpt (presumably each delegates to the matching Tokenizer
  // super-constructor) — confirm against the full source.
  public CJKTokenizer(Reader in) {

  /**
   * Construct a token stream processing the given input, presumably reusing
   * the attributes of the supplied AttributeSource — TODO confirm, body elided.
   *
   * @param source attribute source to share attributes with
   * @param in I/O reader
   */
  public CJKTokenizer(AttributeSource source, Reader in) {

  /**
   * Construct a token stream processing the given input, presumably creating
   * attributes with the supplied factory — TODO confirm, body elided.
   *
   * @param factory attribute factory used to create this stream's attributes
   * @param in I/O reader
   */
  public CJKTokenizer(AttributeFactory factory, Reader in) {
  //~ Methods ----------------------------------------------------------------

  /**
   * Returns true for the next token in the stream, or false at EOS.
   * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
   *
   * NOTE(review): a number of lines of this method are elided in this
   * excerpt — local declarations (apparently {@code length}, {@code start},
   * {@code c}, {@code i}), several branches, and closing braces. The comments
   * below describe only the code that is visible; confirm against the full
   * source before relying on them.
   *
   * @return false for end of stream, true otherwise
   *
   * @throws java.io.IOException when a read error <br>
   *         happens in the InputStream
   */
  public boolean incrementToken() throws IOException {
    /** how many character(s) has been stored in buffer */
    while(true) { // loop until we find a non-empty token

      /** the position used to create Token */

      while (true) { // loop until we've found a full token
        /** current character */

        /** unicode block of current character for detail */
        Character.UnicodeBlock ub;

        // refill ioBuffer once the previous chunk has been fully consumed
        if (bufferIndex >= dataLen) {
          dataLen = input.read(ioBuffer);

          // at end of input: if the head of the buffer was already emitted as
          // part of the previous overlapping bigram, clear the flag
          // (NOTE(review): surrounding EOF-handling lines elided here)
          if (preIsTokened == true) {
            preIsTokened = false;

        //get current character
        c = ioBuffer[bufferIndex++];

        //get the UnicodeBlock of the current character
        ub = Character.UnicodeBlock.of(c);

        //if the current character is ASCII or Extend ASCII
        if ((ub == Character.UnicodeBlock.BASIC_LATIN)
            || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)

          if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
            // fullwidth code points U+FF01..U+FF5E (65281..65374) map onto
            // the ASCII range '!'..'~'
            if (i >= 65281 && i <= 65374) {
              // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN

          // if the current character is a letter or "_" "+" "#"
          if (Character.isLetterOrDigit(c)
              || ((c == '_') || (c == '+') || (c == '#'))

            // "javaC1C2C3C4linux" <br>
            //      ^--: the current character begins to token the ASCII
            // letter

          } else if (tokenType == DOUBLE_TOKEN_TYPE) {
            // "javaC1C2C3C4linux" <br>
            //              ^--: the previous character is non-ASCII,
            //                   the current character is ASCII

            if (preIsTokened == true) {
              // there is only one non-ASCII character stored in the buffer
              preIsTokened = false;

          // store the LowerCase(c) in the buffer
          buffer[length++] = Character.toLowerCase(c);
          tokenType = SINGLE_TOKEN_TYPE;

          // break the procedure if buffer overflowed!
          if (length == MAX_WORD_LEN) {

        } else if (length > 0) {
          if (preIsTokened == true) {
            preIsTokened = false;

          // non-ASCII letter, e.g."C1C2C3C4"
          if (Character.isLetter(c)) {
            buffer[length++] = c;
            tokenType = DOUBLE_TOKEN_TYPE;

            if (tokenType == SINGLE_TOKEN_TYPE) {
              // flush the previously accumulated ASCII characters first

            buffer[length++] = c;
            tokenType = DOUBLE_TOKEN_TYPE;

        } else if (length > 0) {
          if (preIsTokened == true) {
            preIsTokened = false;

      // publish the assembled token through the attributes
      termAtt.copyBuffer(buffer, 0, length);
      offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
      typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]);

    } else if (dataLen == -1) {

    // Cycle back and try for the next token (don't
    // return an empty string)
  /**
   * End-of-stream hook: records the corrected final offset of the input on
   * the offset attribute (start == end == final offset).
   */
  public final void end() {
    // set final offset
    final int finalOffset = correctOffset(offset);
    this.offsetAtt.setOffset(finalOffset, finalOffset);
  // NOTE(review): the method's closing brace is elided in this excerpt.
  /**
   * Resets the tokenizer's scan state so a fresh input can be consumed:
   * rewinds buffer positions and returns the bigram machine to its
   * default state.
   */
  public void reset() throws IOException {
    // NOTE(review): one line is elided here in this excerpt — most likely a
    // super.reset() call; confirm against the full source.
    offset = bufferIndex = dataLen = 0;
    preIsTokened = false;
    tokenType = WORD_TYPE;
  // NOTE(review): the method's closing brace is elided in this excerpt.
311 public void reset(Reader reader) throws IOException {