/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.UnsupportedEncodingException;

/**
 * <p>
 * SmartChineseAnalyzer abstract dictionary implementation.
 * </p>
 * <p>
 * Contains methods for dealing with GB2312 encoding, plus the hash functions
 * shared by the concrete dictionaries.
 * </p>
 * @lucene.experimental
 */
abstract class AbstractDictionary {
  /**
   * First Chinese Character in GB2312 (15 * 94).
   * Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation.
   */
  public static final int GB2312_FIRST_CHAR = 1410;

  /**
   * Last Chinese Character in GB2312 (87 * 94).
   * Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are unassigned.
   */
  public static final int GB2312_CHAR_NUM = 87 * 94;

  /**
   * Dictionary data contains 6768 Chinese characters with frequency statistics.
   */
  public static final int CHAR_NUM_IN_FILE = 6768;

  /** Charset name used for every byte/String conversion in this class. */
  private static final String GB2312_CHARSET = "GB2312";

  /** Number of cells in one row of the GB2312 grid. */
  private static final int GB2312_ROW_SIZE = 94;

  /** Lowest GB2312 byte value (0xA1); subtracting it yields a zero-based grid coordinate. */
  private static final int GB2312_BYTE_OFFSET = 161;

  /** Start value of {@link #hash1(char)} and {@link #hash1(char[])} (the 64-bit FNV offset basis). */
  private static final long FNV_OFFSET_BASIS = 0xcbf29ce484222325L;

  /** Multiplier of {@link #hash1(char)} and {@link #hash1(char[])} (the 64-bit FNV prime). */
  private static final long FNV_PRIME = 1099511628211L;

  /** Start value of {@link #hash2(char)} and {@link #hash2(char[])}. */
  private static final int DJB2_SEED = 5381;

  // =====================================================
  // code +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +A +B +C +D +E +F
  // B0A0    啊 阿 埃 挨 哎 唉 哀 皑 癌 蔼 矮 艾 碍 爱 隘
  // B0B0 鞍 氨 安 俺 按 暗 岸 胺 案 肮 昂 盎 凹 敖 熬 翱
  // B0C0 袄 傲 奥 懊 澳 芭 捌 扒 叭 吧 笆 八 疤 巴 拔 跋
  // B0D0 靶 把 耙 坝 霸 罢 爸 白 柏 百 摆 佰 败 拜 稗 斑
  // B0E0 班 搬 扳 般 颁 板 版 扮 拌 伴 瓣 半 办 绊 邦 帮
  // B0F0 梆 榜 膀 绑 棒 磅 蚌 镑 傍 谤 苞 胞 包 褒 剥
  // =====================================================
  //
  // GB2312 character set:
  // 01     94   Symbols
  // 02     72   Numbers
  // 03     94   Latin
  // 04     83   Hiragana
  // 05     86   Katakana
  // 06     48   Greek
  // 07     66   Cyrillic
  // 08     63   Phonetic Symbols
  // 09     76   Drawing Symbols
  // 10-15       Unassigned
  // 16-55  3755 Plane 1, in pinyin order
  // 56-87  3008 Plane 2, in radical/stroke order
  // 88-94       Unassigned
  // ======================================================

  /**
   * <p>
   * Transcode from GB2312 ID to Unicode.
   * </p>
   * <p>
   * GB2312 is divided into a 94 * 94 grid, containing 7445 characters consisting of
   * 6763 Chinese characters and 682 symbols. Some regions are unassigned (reserved).
   * </p>
   * <p>
   * The id is split into a row ({@code ccid / 94}) and a column ({@code ccid % 94});
   * adding 0xA1 to each gives the two bytes that are decoded as GB2312.
   * </p>
   *
   * @param ccid GB2312 id
   * @return the decoded String, or the empty String when {@code ccid} is negative,
   *         greater than {@link #GB2312_CHAR_NUM}, or the GB2312 charset is unavailable
   */
  public String getCCByGB2312Id(int ccid) {
    // Use this class's own constant: the bound used to be read through the
    // WordDictionary subclass, which made the base class depend on its subclass.
    if (ccid < 0 || ccid > GB2312_CHAR_NUM) {
      return "";
    }
    byte[] buffer = new byte[] {
        (byte) (ccid / GB2312_ROW_SIZE + GB2312_BYTE_OFFSET),
        (byte) (ccid % GB2312_ROW_SIZE + GB2312_BYTE_OFFSET)
    };
    try {
      return new String(buffer, GB2312_CHARSET);
    } catch (UnsupportedEncodingException e) {
      // Without GB2312 support nothing can be decoded; answer as for an invalid id.
      return "";
    }
  }

  /**
   * Transcode from Unicode to GB2312.
   *
   * @param ch input character in Unicode, or character in Basic Latin range.
   * @return position in GB2312 ({@code row * 94 + column}, both zero-based), or -1 when
   *         {@code ch} does not encode to exactly two bytes or the GB2312 charset is
   *         unavailable
   */
  public short getGB2312Id(char ch) {
    try {
      byte[] buffer = Character.toString(ch).getBytes(GB2312_CHARSET);
      if (buffer.length != 2) {
        // Should be a two-byte character
        return -1;
      }
      // Code starts from A1, therefore subtract 0xA1=161.
      int b0 = (buffer[0] & 0x0FF) - GB2312_BYTE_OFFSET;
      // There is no Chinese char for the first and last symbol of a 16*6 code page,
      // therefore each code page only has 16*6-2=94 characters.
      int b1 = (buffer[1] & 0x0FF) - GB2312_BYTE_OFFSET;
      return (short) (b0 * GB2312_ROW_SIZE + b1);
    } catch (UnsupportedEncodingException e) {
      // Kept as in the original: report the problem on stderr and answer "unknown".
      e.printStackTrace();
      return -1;
    }
  }

  /**
   * FNV-style hash of a single character, followed by an extra bit-mixing stage.
   * <p>
   * The low byte and then the high byte of {@code c} are each xor-ed into the hash and
   * multiplied by the FNV prime, starting from the 64-bit FNV offset basis. The result
   * is then scrambled with a fixed sequence of shifts, additions and xors. The value
   * is a full 64-bit {@code long}.
   * </p>
   *
   * @param c input character
   * @return hashcode
   */
  public long hash1(char c) {
    long hash = fnvStep(FNV_OFFSET_BASIS, c);
    hash += hash << 13;
    hash ^= hash >> 7;
    hash += hash << 3;
    hash ^= hash >> 17;
    hash += hash << 5;
    return hash;
  }

  /**
   * FNV-style hash of a character array.
   * <p>
   * Every character contributes its low byte and then its high byte, exactly as in
   * {@link #hash1(char)}, but no final bit-mixing stage is applied. Consequently
   * {@code hash1(new char[] {c})} differs from {@code hash1(c)}, and an empty array
   * hashes to the 64-bit FNV offset basis.
   * </p>
   *
   * @param carray character array
   * @return hashcode
   */
  public long hash1(char[] carray) {
    long hash = FNV_OFFSET_BASIS;
    for (int i = 0; i < carray.length; i++) {
      hash = fnvStep(hash, carray[i]);
    }
    return hash;
  }

  /**
   * Hash of a single character derived from the djb2 algorithm (k=33), first reported
   * by Dan Bernstein in comp.lang.c.
   * <p>
   * This is <em>not</em> textbook djb2. With {@code h} starting at 5381 the method
   * computes {@code h = (h * 33 + c) & 0xFF} and then {@code h = (h * 33 + c) >> 8},
   * because {@code +} binds more tightly than {@code &} and {@code >>}.
   * NOTE(review): the results are kept bit-for-bit as they were; callers presumably
   * use them to index stored tables, so confirm before "correcting" the formula.
   * </p>
   *
   * @param c character
   * @return hashcode
   */
  public int hash2(char c) {
    return djb2Step(DJB2_SEED, c);
  }

  /**
   * Hash of a character array derived from the djb2 algorithm (k=33), first reported
   * by Dan Bernstein in comp.lang.c.
   * <p>
   * Applies the same two-line step as {@link #hash2(char)} once per character, feeding
   * the result of one character into the next. An empty array returns the seed 5381.
   * NOTE(review): as with {@link #hash2(char)}, this deviates from textbook djb2 and
   * is preserved bit-for-bit on purpose.
   * </p>
   *
   * @param carray character array
   * @return hashcode
   */
  public int hash2(char[] carray) {
    int hash = DJB2_SEED;
    for (int i = 0; i < carray.length; i++) {
      hash = djb2Step(hash, carray[i]);
    }
    return hash;
  }

  /** Folds the low byte, then the high byte, of {@code c} into an FNV-style running hash. */
  private static long fnvStep(long hash, char c) {
    hash = (hash ^ (c & 0x00FF)) * FNV_PRIME;
    hash = (hash ^ (c >> 8)) * FNV_PRIME;
    return hash;
  }

  /** Folds {@code c} into the running hash2 value; parentheses spell out the original precedence. */
  private static int djb2Step(int hash, char c) {
    hash = (((hash << 5) + hash) + c) & 0x00FF;
    hash = (((hash << 5) + hash) + c) >> 8;
    return hash;
  }
}