1 package org.apache.lucene.analysis.cn;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 import java.io.IOException;
22 import java.io.Reader;
24 import org.apache.lucene.analysis.standard.StandardTokenizer;
25 import org.apache.lucene.analysis.Tokenizer;
26 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
27 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
28 import org.apache.lucene.util.AttributeSource;
32 * Tokenize Chinese text as individual Chinese characters.
35 * The difference between ChineseTokenizer and
36 * CJKTokenizer is that they have different
37 * token parsing logic.
40 * For example, if the Chinese text
41 * "C1C2C3C4" is to be indexed:
43 * <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4.
44 * <li>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.
48 * Therefore the index created by CJKTokenizer is much larger.
51 * The problem is that when searching for C1, C1C2, C1C3,
52 * C4C2, C1C2C3 ... the ChineseTokenizer works, but the
53 * CJKTokenizer will not work.
56 * @deprecated Use {@link StandardTokenizer} instead, which has the same functionality.
57 * This filter will be removed in Lucene 5.0
60 public final class ChineseTokenizer extends Tokenizer {
// Emits one token per Chinese (CJK) character; see the class javadoc above.
// NOTE(review): the constructor bodies are elided in this listing — presumably
// each simply delegates to the matching Tokenizer super-constructor; confirm
// against the full source before relying on this.
63 public ChineseTokenizer(Reader in) {
67 public ChineseTokenizer(AttributeSource source, Reader in) {
71 public ChineseTokenizer(AttributeFactory factory, Reader in) {
// offset: absolute read position in the input (used to compute token offsets);
// bufferIndex: cursor into ioBuffer; dataLen: count of valid chars in ioBuffer
// (set from input.read(...) in incrementToken).
75 private int offset = 0, bufferIndex=0, dataLen=0;
// Hard cap on a single token's length; a longer run is flushed early
// (see the MAX_WORD_LEN check in incrementToken).
76 private final static int MAX_WORD_LEN = 255;
77 private final static int IO_BUFFER_SIZE = 1024;
// buffer: accumulates the chars of the token being built (via push()).
78 private final char[] buffer = new char[MAX_WORD_LEN];
// ioBuffer: raw read buffer refilled from the input Reader.
79 private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
// Attributes populated for each emitted token: term text and start/end offsets.
85 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
86 private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
// Append c (lower-cased) to the current token buffer; on the token's first
// char, record its start offset. The `length` and `start` fields referenced
// here are declared on lines elided from this listing.
88 private final void push(char c) {
// offset has already been advanced past c when this runs, hence offset-1.
90 if (length == 0) start = offset-1; // start of token
91 buffer[length++] = Character.toLowerCase(c); // buffer it
// Emit the buffered chars as a token: copy them into the term attribute and
// set corrected start/end offsets. NOTE(review): the return statements are
// elided in this listing — presumably true when a token was produced, false
// when the buffer was empty; confirm against the full source.
95 private final boolean flush() {
98 //System.out.println(new String(buffer, 0,
100 termAtt.copyBuffer(buffer, 0, length);
101 offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
// Advance to the next token. Refills ioBuffer from the input when the cursor
// reaches the end of the valid data, then classifies each char by Unicode
// category. NOTE(review): large parts of the loop (EOF handling, offset
// bookkeeping, the enclosing while) are elided from this listing.
109 public boolean incrementToken() throws IOException {
// Refill the raw buffer once all buffered chars have been consumed.
121 if (bufferIndex >= dataLen) {
122 dataLen = input.read(ioBuffer);
130 c = ioBuffer[bufferIndex++];
// Dispatch on the Unicode general category of the char just read.
133 switch(Character.getType(c)) {
// Digits and basic letters accumulate into a single multi-char token...
135 case Character.DECIMAL_DIGIT_NUMBER:
136 case Character.LOWERCASE_LETTER:
137 case Character.UPPERCASE_LETTER:
// ...which is flushed early once the fixed-size token buffer is full.
139 if (length == MAX_WORD_LEN) return flush();
// OTHER_LETTER covers CJK ideographs; per the class javadoc each such char
// becomes its own single-character token (the handling itself is elided here
// — confirm against the full source).
142 case Character.OTHER_LETTER:
// Any other category terminates the current token, if one is pending.
152 if (length>0) return flush();
// End-of-stream hook: set both start and end offsets to the corrected final
// read position, as the TokenStream end() contract requires.
159 public final void end() {
161 final int finalOffset = correctOffset(offset);
162 this.offsetAtt.setOffset(finalOffset, finalOffset);
// Reset internal read state so the tokenizer can consume a fresh input.
// NOTE(review): a super.reset() call, if present, is on a line elided from
// this listing.
166 public void reset() throws IOException {
168 offset = bufferIndex = dataLen = 0;
172 public void reset(Reader input) throws IOException {