X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java?ds=inline
diff --git a/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
new file mode 100644
index 0000000..d907d51
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
@@ -0,0 +1,315 @@
+package org.apache.lucene.analysis.cjk;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * CJKTokenizer is designed for Chinese, Japanese, and Korean languages.
+ *
+ * The tokens returned are every two adjacent characters with overlap match.
+ *
+ *
+ * Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4".
+ *
+ * Additionally, the following is applied to Latin text (such as English):
+ *
+ * - Text is converted to lowercase.
+ *
+ * - Numeric digits, '+', '#', and '_' are tokenized as letters.
+ *
+ * - Full-width forms are converted to half-width forms.
+ *
+ * For more information on Chinese, Japanese, and Korean text segmentation,
+ * see the published literature on CJK bigram (n-gram) tokenization.
+ *
+ */
+public final class CJKTokenizer extends Tokenizer {
+ //~ Static fields/initializers ---------------------------------------------
+ /** Word token type */
+ static final int WORD_TYPE = 0;
+
+ /** Single byte token type */
+ static final int SINGLE_TOKEN_TYPE = 1;
+
+ /** Double byte token type */
+ static final int DOUBLE_TOKEN_TYPE = 2;
+
+ /** Names for token types */
+ static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
+
+ /** Max word length */
+ private static final int MAX_WORD_LEN = 255;
+
+ /** buffer size: */
+ private static final int IO_BUFFER_SIZE = 256;
+
+ //~ Instance fields --------------------------------------------------------
+
+    /** word offset, used to track which character in the input is being parsed */
+ private int offset = 0;
+
+ /** the index used only for ioBuffer */
+ private int bufferIndex = 0;
+
+ /** data length */
+ private int dataLen = 0;
+
+ /**
+ * character buffer, store the characters which are used to compose
+ * the returned Token
+ */
+ private final char[] buffer = new char[MAX_WORD_LEN];
+
+ /**
+     * I/O buffer, used to store the content of the input (one of the
+ * members of Tokenizer)
+ */
+ private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+ /** word type: single=>ASCII double=>non-ASCII word=>default */
+ private int tokenType = WORD_TYPE;
+
+ /**
+ * tag: previous character is a cached double-byte character "C1C2C3C4"
+ * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
+ * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
+ */
+ private boolean preIsTokened = false;
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+ //~ Constructors -----------------------------------------------------------
+
+ /**
+ * Construct a token stream processing the given input.
+ *
+ * @param in I/O reader
+ */
+ public CJKTokenizer(Reader in) {
+ super(in);
+ }
+
+ public CJKTokenizer(AttributeSource source, Reader in) {
+ super(source, in);
+ }
+
+ public CJKTokenizer(AttributeFactory factory, Reader in) {
+ super(factory, in);
+ }
+
+ //~ Methods ----------------------------------------------------------------
+
+ /**
+ * Returns true for the next token in the stream, or false at EOS.
+     * See https://docs.oracle.com/javase/8/docs/api/java/lang/Character.UnicodeBlock.html
+ * for detail.
+ *
+ * @return false for end of stream, true otherwise
+ *
+ * @throws java.io.IOException - throw IOException when read error
+ * happened in the InputStream
+ *
+ */
+ @Override
+ public boolean incrementToken() throws IOException {
+ clearAttributes();
+        /** how many characters have been stored in the buffer */
+
+ while(true) { // loop until we find a non-empty token
+
+ int length = 0;
+
+ /** the position used to create Token */
+ int start = offset;
+
+ while (true) { // loop until we've found a full token
+ /** current character */
+ char c;
+
+ /** unicode block of current character for detail */
+ Character.UnicodeBlock ub;
+
+ offset++;
+
+ if (bufferIndex >= dataLen) {
+ dataLen = input.read(ioBuffer);
+ bufferIndex = 0;
+ }
+
+ if (dataLen == -1) {
+ if (length > 0) {
+ if (preIsTokened == true) {
+ length = 0;
+ preIsTokened = false;
+ }
+ else{
+ offset--;
+ }
+
+ break;
+ } else {
+ offset--;
+ return false;
+ }
+ } else {
+ //get current character
+ c = ioBuffer[bufferIndex++];
+
+ //get the UnicodeBlock of the current character
+ ub = Character.UnicodeBlock.of(c);
+ }
+
+ //if the current character is ASCII or Extend ASCII
+ if ((ub == Character.UnicodeBlock.BASIC_LATIN)
+ || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
+ ) {
+ if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
+ int i = (int) c;
+ if (i >= 65281 && i <= 65374) {
+ // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
+ i = i - 65248;
+ c = (char) i;
+ }
+ }
+
+ // if the current character is a letter or "_" "+" "#"
+ if (Character.isLetterOrDigit(c)
+ || ((c == '_') || (c == '+') || (c == '#'))
+ ) {
+ if (length == 0) {
+ // "javaC1C2C3C4linux"
+ // ^--: the current character begin to token the ASCII
+ // letter
+ start = offset - 1;
+ } else if (tokenType == DOUBLE_TOKEN_TYPE) {
+ // "javaC1C2C3C4linux"
+ // ^--: the previous non-ASCII
+ // : the current character
+ offset--;
+ bufferIndex--;
+
+ if (preIsTokened == true) {
+ // there is only one non-ASCII has been stored
+ length = 0;
+ preIsTokened = false;
+ break;
+ } else {
+ break;
+ }
+ }
+
+ // store the LowerCase(c) in the buffer
+ buffer[length++] = Character.toLowerCase(c);
+ tokenType = SINGLE_TOKEN_TYPE;
+
+ // break the procedure if buffer overflowed!
+ if (length == MAX_WORD_LEN) {
+ break;
+ }
+ } else if (length > 0) {
+ if (preIsTokened == true) {
+ length = 0;
+ preIsTokened = false;
+ } else {
+ break;
+ }
+ }
+ } else {
+ // non-ASCII letter, e.g."C1C2C3C4"
+ if (Character.isLetter(c)) {
+ if (length == 0) {
+ start = offset - 1;
+ buffer[length++] = c;
+ tokenType = DOUBLE_TOKEN_TYPE;
+ } else {
+ if (tokenType == SINGLE_TOKEN_TYPE) {
+ offset--;
+ bufferIndex--;
+
+ //return the previous ASCII characters
+ break;
+ } else {
+ buffer[length++] = c;
+ tokenType = DOUBLE_TOKEN_TYPE;
+
+ if (length == 2) {
+ offset--;
+ bufferIndex--;
+ preIsTokened = true;
+
+ break;
+ }
+ }
+ }
+ } else if (length > 0) {
+ if (preIsTokened == true) {
+ // empty the buffer
+ length = 0;
+ preIsTokened = false;
+ } else {
+ break;
+ }
+ }
+ }
+ }
+
+ if (length > 0) {
+ termAtt.copyBuffer(buffer, 0, length);
+ offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
+ typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]);
+ return true;
+ } else if (dataLen == -1) {
+ offset--;
+ return false;
+ }
+
+ // Cycle back and try for the next token (don't
+ // return an empty string)
+ }
+ }
+
+ @Override
+ public final void end() {
+ // set final offset
+ final int finalOffset = correctOffset(offset);
+ this.offsetAtt.setOffset(finalOffset, finalOffset);
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ offset = bufferIndex = dataLen = 0;
+ preIsTokened = false;
+ tokenType = WORD_TYPE;
+ }
+
+ @Override
+ public void reset(Reader reader) throws IOException {
+ super.reset(reader);
+ reset();
+ }
+}