/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
18 package org.apache.lucene.analysis.cn.smart.hhmm;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
/**
 * SmartChineseAnalyzer Bigram dictionary.
 *
 * @lucene.experimental
 */
39 class BigramDictionary extends AbstractDictionary {
41 private BigramDictionary() {
44 public static final char WORD_SEGMENT_CHAR = '@';
46 private static BigramDictionary singleInstance;
48 public static final int PRIME_BIGRAM_LENGTH = 402137;
51 * The word associations are stored as FNV1 hashcodes, which have a small probability of collision, but save memory.
53 private long[] bigramHashTable;
55 private int[] frequencyTable;
59 private int repeat = 0;
61 // static Logger log = Logger.getLogger(BigramDictionary.class);
63 public synchronized static BigramDictionary getInstance() {
64 if (singleInstance == null) {
65 singleInstance = new BigramDictionary();
67 singleInstance.load();
68 } catch (IOException e) {
69 String dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
70 singleInstance.load(dictRoot);
71 } catch (ClassNotFoundException e) {
72 throw new RuntimeException(e);
75 return singleInstance;
78 private boolean loadFromObj(File serialObj) {
80 loadFromInputStream(new FileInputStream(serialObj));
82 } catch (FileNotFoundException e) {
84 } catch (IOException e) {
86 } catch (ClassNotFoundException e) {
92 private void loadFromInputStream(InputStream serialObjectInputStream)
93 throws IOException, ClassNotFoundException {
94 ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
95 bigramHashTable = (long[]) input.readObject();
96 frequencyTable = (int[]) input.readObject();
97 // log.info("load bigram dict from serialization.");
101 private void saveToObj(File serialObj) {
103 ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
105 output.writeObject(bigramHashTable);
106 output.writeObject(frequencyTable);
108 // log.info("serialize bigram dict.");
109 } catch (Exception e) {
110 // log.warn(e.getMessage());
114 private void load() throws IOException, ClassNotFoundException {
115 InputStream input = this.getClass().getResourceAsStream("bigramdict.mem");
116 loadFromInputStream(input);
119 private void load(String dictRoot) {
120 String bigramDictPath = dictRoot + "/bigramdict.dct";
122 File serialObj = new File(dictRoot + "/bigramdict.mem");
124 if (serialObj.exists() && loadFromObj(serialObj)) {
128 bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
129 frequencyTable = new int[PRIME_BIGRAM_LENGTH];
130 for (int i = 0; i < PRIME_BIGRAM_LENGTH; i++) {
131 // it is possible for a value to hash to 0, but the probability is extremely low
132 bigramHashTable[i] = 0;
133 frequencyTable[i] = 0;
135 loadFromFile(bigramDictPath);
136 } catch (IOException e) {
137 throw new RuntimeException(e.getMessage());
139 saveToObj(serialObj);
144 * Load the datafile into this BigramDictionary
146 * @param dctFilePath path to the Bigramdictionary (bigramdict.dct)
147 * @throws FileNotFoundException
148 * @throws IOException
149 * @throws UnsupportedEncodingException
151 public void loadFromFile(String dctFilePath) throws FileNotFoundException,
152 IOException, UnsupportedEncodingException {
154 int i, cnt, length, total = 0;
155 // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
156 // The 3756th is used (as a header) to store information.
157 int[] buffer = new int[3];
158 byte[] intBuffer = new byte[4];
160 RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
162 // GB2312 characters 0 - 6768
163 for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
164 String currentStr = getCCByGB2312Id(i);
166 // System.out.println(i);
168 dctFile.read(intBuffer);
169 // the dictionary was developed for C, and byte order must be converted to work with Java
170 cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
177 dctFile.read(intBuffer);
178 buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
179 .getInt();// frequency
180 dctFile.read(intBuffer);
181 buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
183 dctFile.read(intBuffer);
184 // buffer[2] = ByteBuffer.wrap(intBuffer).order(
185 // ByteOrder.LITTLE_ENDIAN).getInt();// handle
189 byte[] lchBuffer = new byte[length];
190 dctFile.read(lchBuffer);
191 tmpword = new String(lchBuffer, "GB2312");
192 if (i != 3755 + GB2312_FIRST_CHAR) {
193 tmpword = currentStr + tmpword;
195 char carray[] = tmpword.toCharArray();
196 long hashId = hash1(carray);
197 int index = getAvaliableIndex(hashId, carray);
199 if (bigramHashTable[index] == 0) {
200 bigramHashTable[index] = hashId;
201 // bigramStringTable[index] = tmpword;
203 frequencyTable[index] += buffer[0];
210 // log.info("load dictionary done! " + dctFilePath + " total:" + total);
213 private int getAvaliableIndex(long hashId, char carray[]) {
214 int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
215 int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
217 hash1 = PRIME_BIGRAM_LENGTH + hash1;
219 hash2 = PRIME_BIGRAM_LENGTH + hash2;
222 while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
223 && i < PRIME_BIGRAM_LENGTH) {
224 index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
227 // System.out.println(i - 1);
229 if (i < PRIME_BIGRAM_LENGTH
230 && (bigramHashTable[index] == 0 || bigramHashTable[index] == hashId)) {
237 * lookup the index into the frequency array.
239 private int getBigramItemIndex(char carray[]) {
240 long hashId = hash1(carray);
241 int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
242 int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
244 hash1 = PRIME_BIGRAM_LENGTH + hash1;
246 hash2 = PRIME_BIGRAM_LENGTH + hash2;
250 while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
251 && i < PRIME_BIGRAM_LENGTH) {
252 index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
258 // System.out.println(i - 1);
260 if (i < PRIME_BIGRAM_LENGTH && bigramHashTable[index] == hashId) {
266 public int getFrequency(char[] carray) {
267 int index = getBigramItemIndex(carray);
269 return frequencyTable[index];