1 package org.apache.lucene.analysis.cn;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 import java.io.IOException;
22 import java.io.Reader;
24 import org.apache.lucene.analysis.standard.StandardTokenizer;
25 import org.apache.lucene.analysis.Tokenizer;
26 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
27 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
28 import org.apache.lucene.util.AttributeSource;
32 * Tokenize Chinese text as individual Chinese characters.
35 * The difference between ChineseTokenizer and
36 * CJKTokenizer is that they have different
37 * token parsing logic.
40 * For example, if the Chinese text
41 * "C1C2C3C4" is to be indexed:
43 * <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4.
44 * <li>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.
48 * Therefore the index created by CJKTokenizer is much larger.
51 * The problem is that when searching for C1, C1C2, C1C3,
52 * C4C2, C1C2C3 ... the ChineseTokenizer works, but the
53 * CJKTokenizer will not work.
56 * @deprecated Use {@link StandardTokenizer} instead, which has the same functionality.
57 * This filter will be removed in Lucene 5.0
60 public final class ChineseTokenizer extends Tokenizer {
// Emits one token per Chinese (CJK) character; see the class javadoc above.
// NOTE(review): the constructor bodies are elided in this listing — presumably
// each simply delegates to the matching Tokenizer super-constructor; confirm
// against the full source before relying on this.
63 public ChineseTokenizer(Reader in) {
67 public ChineseTokenizer(AttributeSource source, Reader in) {
71 public ChineseTokenizer(AttributeFactory factory, Reader in) {
// offset: absolute read position in the input (used to compute token offsets);
// bufferIndex: cursor into ioBuffer; dataLen: count of valid chars in ioBuffer
// (set from input.read(...) in incrementToken).
75 private int offset = 0, bufferIndex=0, dataLen=0;
// Hard cap on a single token's length; a longer run is flushed early
// (see the MAX_WORD_LEN check in incrementToken).
76 private final static int MAX_WORD_LEN = 255;
77 private final static int IO_BUFFER_SIZE = 1024;
// buffer: accumulates the chars of the token being built (via push()).
78 private final char[] buffer = new char[MAX_WORD_LEN];
// ioBuffer: raw read buffer refilled from the input Reader.
79 private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
// Attributes populated for each emitted token: term text and start/end offsets.
85 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
86 private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
// Append c (lower-cased) to the current token buffer; on the token's first
// char, record its start offset. The `length` and `start` fields referenced
// here are declared on lines elided from this listing.
88 private final void push(char c) {
// offset has already been advanced past c when this runs, hence offset-1.
90 if (length == 0) start = offset-1; // start of token
91 buffer[length++] = Character.toLowerCase(c); // buffer it
// Emit the buffered chars as a token: copy them into the term attribute and
// set corrected start/end offsets. NOTE(review): the return statements are
// elided in this listing — presumably true when a token was produced, false
// when the buffer was empty; confirm against the full source.
95 private final boolean flush() {
98 //System.out.println(new String(buffer, 0,
100 termAtt.copyBuffer(buffer, 0, length);
101 offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
// Advance to the next token. Refills ioBuffer from the input when the cursor
// reaches the end of the valid data, then classifies each char by Unicode
// category. NOTE(review): large parts of the loop (EOF handling, offset
// bookkeeping, the enclosing while) are elided from this listing.
109 public boolean incrementToken() throws IOException {
// Refill the raw buffer once all buffered chars have been consumed.
121 if (bufferIndex >= dataLen) {
122 dataLen = input.read(ioBuffer);
130 c = ioBuffer[bufferIndex++];
// Dispatch on the Unicode general category of the char just read.
133 switch(Character.getType(c)) {
// Digits and basic letters accumulate into a single multi-char token...
135 case Character.DECIMAL_DIGIT_NUMBER:
136 case Character.LOWERCASE_LETTER:
137 case Character.UPPERCASE_LETTER:
// ...which is flushed early once the fixed-size token buffer is full.
139 if (length == MAX_WORD_LEN) return flush();
// OTHER_LETTER covers CJK ideographs; per the class javadoc each such char
// becomes its own single-character token (the handling itself is elided here
// — confirm against the full source).
142 case Character.OTHER_LETTER:
// Any other category terminates the current token, if one is pending.
152 if (length>0) return flush();
// End-of-stream hook: set both start and end offsets to the corrected final
// read position, as the TokenStream end() contract requires.
159 public final void end() {
161 final int finalOffset = correctOffset(offset);
162 this.offsetAtt.setOffset(finalOffset, finalOffset);
// Reset internal read state so the tokenizer can consume a fresh input.
// NOTE(review): a super.reset() call, if present, is on a line elided from
// this listing.
166 public void reset() throws IOException {
168 offset = bufferIndex = dataLen = 0;
172 public void reset(Reader input) throws IOException {