1 package org.apache.lucene.analysis.cjk;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.io.Reader;
23 import org.apache.lucene.analysis.Tokenizer;
24 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
25 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
26 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
27 import org.apache.lucene.util.AttributeSource;
30 * CJKTokenizer is designed for Chinese, Japanese, and Korean languages.
32 * The tokens returned are every two adjacent characters with overlap match.
35 * Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4".
37 * Additionally, the following is applied to Latin text (such as English):
39 * <li>Text is converted to lowercase.
40 * <li>Numeric digits, '+', '#', and '_' are tokenized as letters.
41 * <li>Full-width forms are converted to half-width forms.
43 * For more info on Asian language (Chinese, Japanese, and Korean) text segmentation:
45 * href="http://www.google.com/search?q=word+chinese+segment">google</a>
48 public final class CJKTokenizer extends Tokenizer {
  //~ Static fields/initializers ---------------------------------------------

  /** Word token type (the default/initial state of {@code tokenType}). */
  static final int WORD_TYPE = 0;

  /** Single byte token type (runs of ASCII/Latin characters). */
  static final int SINGLE_TOKEN_TYPE = 1;

  /** Double byte token type (overlapping non-ASCII character bigrams). */
  static final int DOUBLE_TOKEN_TYPE = 2;

  /** Names for token types, indexed by the *_TYPE constants above. */
  static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };

  /** Max word length (capacity of the token assembly buffer). */
  private static final int MAX_WORD_LEN = 255;

  /** Size of the chunk read from the input Reader at a time. */
  private static final int IO_BUFFER_SIZE = 256;

  //~ Instance fields --------------------------------------------------------

  /** word offset: position (in characters) within the overall input of the character being parsed */
  private int offset = 0;

  /** the index used only for ioBuffer */
  private int bufferIndex = 0;

  /** number of valid characters currently held in ioBuffer; -1 once the Reader reports end of stream */
  private int dataLen = 0;

  /**
   * character buffer: stores the characters which are used to compose the
   * token currently being assembled
   */
  private final char[] buffer = new char[MAX_WORD_LEN];

  /**
   * I/O buffer, used to store the content of the input (one of the
   * members of Tokenizer)
   */
  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

  /** word type: single=>ASCII  double=>non-ASCII  word=>default */
  private int tokenType = WORD_TYPE;

  /**
   * Flag: the previous character belongs to a cached double-byte bigram.
   * For "C1C2C3C4": after C1C2 is emitted, C2 is marked as already tokened
   * so the overlap continues with C2C3, then C3C4 => "C1C2 C2C3 C3C4".
   */
  private boolean preIsTokened = false;

  /** term text of the current token */
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  /** start/end character offsets of the current token */
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  /** token type name, one of TOKEN_TYPE_NAMES */
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  //~ Constructors -----------------------------------------------------------

  /**
   * Construct a token stream processing the given input.
   *
   * @param in I/O reader
   */
  // NOTE(review): the constructor bodies and closing braces are elided in
  // this excerpt (presumably each delegates to the matching Tokenizer
  // super-constructor) — confirm against the full source.
  public CJKTokenizer(Reader in) {

  /**
   * Construct a token stream processing the given input, presumably reusing
   * the attributes of the supplied AttributeSource — TODO confirm, body elided.
   *
   * @param source attribute source to share attributes with
   * @param in I/O reader
   */
  public CJKTokenizer(AttributeSource source, Reader in) {

  /**
   * Construct a token stream processing the given input, presumably creating
   * attributes with the supplied factory — TODO confirm, body elided.
   *
   * @param factory attribute factory used to create this stream's attributes
   * @param in I/O reader
   */
  public CJKTokenizer(AttributeFactory factory, Reader in) {
  //~ Methods ----------------------------------------------------------------

  /**
   * Returns true for the next token in the stream, or false at EOS.
   * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
   *
   * NOTE(review): a number of lines of this method are elided in this
   * excerpt — local declarations (apparently {@code length}, {@code start},
   * {@code c}, {@code i}), several branches, and closing braces. The comments
   * below describe only the code that is visible; confirm against the full
   * source before relying on them.
   *
   * @return false for end of stream, true otherwise
   *
   * @throws java.io.IOException when a read error <br>
   *         happens in the InputStream
   */
  public boolean incrementToken() throws IOException {
    /** how many character(s) has been stored in buffer */
    while(true) { // loop until we find a non-empty token

      /** the position used to create Token */

      while (true) { // loop until we've found a full token
        /** current character */

        /** unicode block of current character for detail */
        Character.UnicodeBlock ub;

        // refill ioBuffer once the previous chunk has been fully consumed
        if (bufferIndex >= dataLen) {
          dataLen = input.read(ioBuffer);

          // at end of input: if the head of the buffer was already emitted as
          // part of the previous overlapping bigram, clear the flag
          // (NOTE(review): surrounding EOF-handling lines elided here)
          if (preIsTokened == true) {
            preIsTokened = false;

        //get current character
        c = ioBuffer[bufferIndex++];

        //get the UnicodeBlock of the current character
        ub = Character.UnicodeBlock.of(c);

        //if the current character is ASCII or Extend ASCII
        if ((ub == Character.UnicodeBlock.BASIC_LATIN)
            || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)

          if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
            // fullwidth code points U+FF01..U+FF5E (65281..65374) map onto
            // the ASCII range '!'..'~'
            if (i >= 65281 && i <= 65374) {
              // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN

          // if the current character is a letter or "_" "+" "#"
          if (Character.isLetterOrDigit(c)
              || ((c == '_') || (c == '+') || (c == '#'))

            // "javaC1C2C3C4linux" <br>
            //      ^--: the current character begins to token the ASCII
            // letter

          } else if (tokenType == DOUBLE_TOKEN_TYPE) {
            // "javaC1C2C3C4linux" <br>
            //              ^--: the previous character is non-ASCII,
            //                   the current character is ASCII

            if (preIsTokened == true) {
              // there is only one non-ASCII character stored in the buffer
              preIsTokened = false;

          // store the LowerCase(c) in the buffer
          buffer[length++] = Character.toLowerCase(c);
          tokenType = SINGLE_TOKEN_TYPE;

          // break the procedure if buffer overflowed!
          if (length == MAX_WORD_LEN) {

        } else if (length > 0) {
          if (preIsTokened == true) {
            preIsTokened = false;

          // non-ASCII letter, e.g."C1C2C3C4"
          if (Character.isLetter(c)) {
            buffer[length++] = c;
            tokenType = DOUBLE_TOKEN_TYPE;

            if (tokenType == SINGLE_TOKEN_TYPE) {
              // flush the previously accumulated ASCII characters first

            buffer[length++] = c;
            tokenType = DOUBLE_TOKEN_TYPE;

        } else if (length > 0) {
          if (preIsTokened == true) {
            preIsTokened = false;

      // publish the assembled token through the attributes
      termAtt.copyBuffer(buffer, 0, length);
      offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
      typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]);

    } else if (dataLen == -1) {

    // Cycle back and try for the next token (don't
    // return an empty string)
  /**
   * End-of-stream hook: records the corrected final offset of the input on
   * the offset attribute (start == end == final offset).
   */
  public final void end() {
    // set final offset
    final int finalOffset = correctOffset(offset);
    this.offsetAtt.setOffset(finalOffset, finalOffset);
  // NOTE(review): the method's closing brace is elided in this excerpt.
  /**
   * Resets the tokenizer's scan state so a fresh input can be consumed:
   * rewinds buffer positions and returns the bigram machine to its
   * default state.
   */
  public void reset() throws IOException {
    // NOTE(review): one line is elided here in this excerpt — most likely a
    // super.reset() call; confirm against the full source.
    offset = bufferIndex = dataLen = 0;
    preIsTokened = false;
    tokenType = WORD_TYPE;
  // NOTE(review): the method's closing brace is elided in this excerpt.
311 public void reset(Reader reader) throws IOException {