2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org.apache.lucene.analysis.cn.smart.hhmm;
21 import java.io.FileInputStream;
22 import java.io.FileNotFoundException;
23 import java.io.FileOutputStream;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.ObjectInputStream;
27 import java.io.ObjectOutputStream;
28 import java.io.RandomAccessFile;
29 import java.io.UnsupportedEncodingException;
30 import java.nio.ByteBuffer;
31 import java.nio.ByteOrder;
33 import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
34 import org.apache.lucene.analysis.cn.smart.Utility;
37 * SmartChineseAnalyzer Word Dictionary
38 * @lucene.experimental
40 class WordDictionary extends AbstractDictionary {
42 private WordDictionary() {
45 private static WordDictionary singleInstance;
48 * Large prime number for hash function
50 public static final int PRIME_INDEX_LENGTH = 12071;
53 * wordIndexTable guarantees to hash all Chinese characters in Unicode into
54 * PRIME_INDEX_LENGTH array. There will be conflict, but in reality this
55 * program only handles the 6768 characters found in GB2312 plus some
56 * ASCII characters. Therefore in order to guarantee better precision, it is
57 * necessary to retain the original symbol in the charIndexTable.
59 private short[] wordIndexTable;
61 private char[] charIndexTable;
64 * To avoid taking too much space, the data structure needed to store the
65 * lexicon requires two multidimensional arrays to store word and frequency.
66 * Each word is placed in a char[]. Each char represents a Chinese char or
67 * other symbol. Each frequency is put into an int. These two arrays
68 * correspond to each other one-to-one. Therefore, one can use
69 * wordItem_charArrayTable[i][j] to look up word from lexicon, and
70 * wordItem_frequencyTable[i][j] to look up the corresponding frequency.
72 private char[][][] wordItem_charArrayTable;
74 private int[][] wordItem_frequencyTable;
76 // static Logger log = Logger.getLogger(WordDictionary.class);
79 * Get the singleton dictionary instance.
82 public synchronized static WordDictionary getInstance() {
83 if (singleInstance == null) {
84 singleInstance = new WordDictionary();
86 singleInstance.load();
87 } catch (IOException e) {
88 String wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
89 singleInstance.load(wordDictRoot);
90 } catch (ClassNotFoundException e) {
91 throw new RuntimeException(e);
94 return singleInstance;
98 * Attempt to load dictionary from provided directory, first trying coredict.mem, failing back on coredict.dct
100 * @param dctFileRoot path to dictionary directory
102 public void load(String dctFileRoot) {
103 String dctFilePath = dctFileRoot + "/coredict.dct";
104 File serialObj = new File(dctFileRoot + "/coredict.mem");
106 if (serialObj.exists() && loadFromObj(serialObj)) {
110 wordIndexTable = new short[PRIME_INDEX_LENGTH];
111 charIndexTable = new char[PRIME_INDEX_LENGTH];
112 for (int i = 0; i < PRIME_INDEX_LENGTH; i++) {
113 charIndexTable[i] = 0;
114 wordIndexTable[i] = -1;
116 wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][];
117 wordItem_frequencyTable = new int[GB2312_CHAR_NUM][];
119 loadMainDataFromFile(dctFilePath);
120 expandDelimiterData();
123 // log.info("load dictionary: " + dctFilePath + " total:" + total);
124 } catch (IOException e) {
125 throw new RuntimeException(e.getMessage());
128 saveToObj(serialObj);
134 * Load coredict.mem internally from the jar file.
136 * @throws ClassNotFoundException
137 * @throws IOException
139 public void load() throws IOException, ClassNotFoundException {
140 InputStream input = this.getClass().getResourceAsStream("coredict.mem");
141 loadFromObjectInputStream(input);
144 private boolean loadFromObj(File serialObj) {
146 loadFromObjectInputStream(new FileInputStream(serialObj));
148 } catch (FileNotFoundException e) {
150 } catch (IOException e) {
152 } catch (ClassNotFoundException e) {
158 private void loadFromObjectInputStream(InputStream serialObjectInputStream)
159 throws IOException, ClassNotFoundException {
160 ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
161 wordIndexTable = (short[]) input.readObject();
162 charIndexTable = (char[]) input.readObject();
163 wordItem_charArrayTable = (char[][][]) input.readObject();
164 wordItem_frequencyTable = (int[][]) input.readObject();
165 // log.info("load core dict from serialization.");
169 private void saveToObj(File serialObj) {
171 ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
173 output.writeObject(wordIndexTable);
174 output.writeObject(charIndexTable);
175 output.writeObject(wordItem_charArrayTable);
176 output.writeObject(wordItem_frequencyTable);
178 // log.info("serialize core dict.");
179 } catch (Exception e) {
180 // log.warn(e.getMessage());
185 * Load the datafile into this WordDictionary
187 * @param dctFilePath path to word dictionary (coredict.dct)
188 * @return number of words read
189 * @throws FileNotFoundException
190 * @throws IOException
191 * @throws UnsupportedEncodingException
193 private int loadMainDataFromFile(String dctFilePath)
194 throws FileNotFoundException, IOException, UnsupportedEncodingException {
195 int i, cnt, length, total = 0;
196 // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
197 // The 3756th is used (as a header) to store information.
198 int[] buffer = new int[3];
199 byte[] intBuffer = new byte[4];
201 RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
203 // GB2312 characters 0 - 6768
204 for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
206 // System.out.println(i);
208 dctFile.read(intBuffer);
209 // the dictionary was developed for C, and byte order must be converted to work with Java
210 cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
212 wordItem_charArrayTable[i] = null;
213 wordItem_frequencyTable[i] = null;
216 wordItem_charArrayTable[i] = new char[cnt][];
217 wordItem_frequencyTable[i] = new int[cnt];
221 // wordItemTable[i][j] = new WordItem();
222 dctFile.read(intBuffer);
223 buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
224 .getInt();// frequency
225 dctFile.read(intBuffer);
226 buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
228 dctFile.read(intBuffer);
229 buffer[2] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
232 // wordItemTable[i][j].frequency = buffer[0];
233 wordItem_frequencyTable[i][j] = buffer[0];
237 byte[] lchBuffer = new byte[length];
238 dctFile.read(lchBuffer);
239 tmpword = new String(lchBuffer, "GB2312");
240 // indexTable[i].wordItems[j].word = tmpword;
241 // wordItemTable[i][j].charArray = tmpword.toCharArray();
242 wordItem_charArrayTable[i][j] = tmpword.toCharArray();
244 // wordItemTable[i][j].charArray = null;
245 wordItem_charArrayTable[i][j] = null;
247 // System.out.println(indexTable[i].wordItems[j]);
251 String str = getCCByGB2312Id(i);
252 setTableIndex(str.charAt(0), i);
259 * The original lexicon puts all information with punctuation into a
260 * chart (from 1 to 3755). Here it then gets expanded, separately being
261 * placed into the chart that has the corresponding symbol.
263 private void expandDelimiterData() {
266 // Punctuation then treating index 3755 as 1,
267 // distribute the original punctuation corresponding dictionary into
268 int delimiterIndex = 3755 + GB2312_FIRST_CHAR;
270 while (i < wordItem_charArrayTable[delimiterIndex].length) {
271 char c = wordItem_charArrayTable[delimiterIndex][i][0];
272 int j = getGB2312Id(c);// the id value of the punctuation
273 if (wordItem_charArrayTable[j] == null) {
276 // Starting from i, count the number of the following worditem symbol from j
277 while (k < wordItem_charArrayTable[delimiterIndex].length
278 && wordItem_charArrayTable[delimiterIndex][k][0] == c) {
281 // c is the punctuation character, j is the id value of c
282 // k-1 represents the index of the last punctuation character
285 wordItem_charArrayTable[j] = new char[cnt][];
286 wordItem_frequencyTable[j] = new int[cnt];
289 // Assign value for each wordItem.
290 for (k = 0; k < cnt; k++, i++) {
291 // wordItemTable[j][k] = new WordItem();
292 wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i];
293 wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].length - 1];
294 System.arraycopy(wordItem_charArrayTable[delimiterIndex][i], 1,
295 wordItem_charArrayTable[j][k], 0,
296 wordItem_charArrayTable[j][k].length);
301 // Delete the original corresponding symbol array.
302 wordItem_charArrayTable[delimiterIndex] = null;
303 wordItem_frequencyTable[delimiterIndex] = null;
307 * since we aren't doing POS-tagging, merge the frequencies for entries of the same word (with different POS)
309 private void mergeSameWords() {
311 for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
312 if (wordItem_charArrayTable[i] == null)
315 for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
316 if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
317 wordItem_charArrayTable[i][j - 1], 0) != 0)
321 if (len < wordItem_charArrayTable[i].length) {
322 char[][] tempArray = new char[len][];
323 int[] tempFreq = new int[len];
325 tempArray[0] = wordItem_charArrayTable[i][0];
326 tempFreq[0] = wordItem_frequencyTable[i][0];
327 for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
328 if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
329 tempArray[k], 0) != 0) {
331 // temp[k] = wordItemTable[i][j];
332 tempArray[k] = wordItem_charArrayTable[i][j];
333 tempFreq[k] = wordItem_frequencyTable[i][j];
335 // temp[k].frequency += wordItemTable[i][j].frequency;
336 tempFreq[k] += wordItem_frequencyTable[i][j];
339 // wordItemTable[i] = temp;
340 wordItem_charArrayTable[i] = tempArray;
341 wordItem_frequencyTable[i] = tempFreq;
346 private void sortEachItems() {
349 for (int i = 0; i < wordItem_charArrayTable.length; i++) {
350 if (wordItem_charArrayTable[i] != null
351 && wordItem_charArrayTable[i].length > 1) {
352 for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) {
353 for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) {
354 if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
355 wordItem_charArrayTable[i][j2], 0) > 0) {
356 tmpArray = wordItem_charArrayTable[i][j];
357 tmpFreq = wordItem_frequencyTable[i][j];
358 wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2];
359 wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2];
360 wordItem_charArrayTable[i][j2] = tmpArray;
361 wordItem_frequencyTable[i][j2] = tmpFreq;
370 * Calculate character c's position in hash table,
371 * then initialize the value of that position in the address table.
373 private boolean setTableIndex(char c, int j) {
374 int index = getAvaliableTableIndex(c);
376 charIndexTable[index] = c;
377 wordIndexTable[index] = (short) j;
383 private short getAvaliableTableIndex(char c) {
384 int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
385 int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
387 hash1 = PRIME_INDEX_LENGTH + hash1;
389 hash2 = PRIME_INDEX_LENGTH + hash2;
392 while (charIndexTable[index] != 0 && charIndexTable[index] != c
393 && i < PRIME_INDEX_LENGTH) {
394 index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
397 // System.out.println(i - 1);
399 if (i < PRIME_INDEX_LENGTH
400 && (charIndexTable[index] == 0 || charIndexTable[index] == c)) {
401 return (short) index;
406 private short getWordItemTableIndex(char c) {
407 int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
408 int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
410 hash1 = PRIME_INDEX_LENGTH + hash1;
412 hash2 = PRIME_INDEX_LENGTH + hash2;
415 while (charIndexTable[index] != 0 && charIndexTable[index] != c
416 && i < PRIME_INDEX_LENGTH) {
417 index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
421 if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) {
422 return (short) index;
428 * Look up the text string corresponding with the word char array,
429 * and return the position of the word list.
431 * @param knownHashIndex already figure out position of the first word
432 * symbol charArray[0] in hash table. If not calculated yet, can be
433 * replaced with function int findInTable(char[] charArray).
434 * @param charArray look up the char array corresponding with the word.
435 * @return word location in word array. If not found, then return -1.
437 private int findInTable(short knownHashIndex, char[] charArray) {
438 if (charArray == null || charArray.length == 0)
441 char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]];
442 int start = 0, end = items.length - 1;
443 int mid = (start + end) / 2, cmpResult;
445 // Binary search for the index of idArray
446 while (start <= end) {
447 cmpResult = Utility.compareArray(items[mid], 0, charArray, 1);
450 return mid;// find it
451 else if (cmpResult < 0)
453 else if (cmpResult > 0)
456 mid = (start + end) / 2;
462 * Find the first word in the dictionary that starts with the supplied prefix
464 * @see #getPrefixMatch(char[], int)
465 * @param charArray input prefix
466 * @return index of word, or -1 if not found
468 public int getPrefixMatch(char[] charArray) {
469 return getPrefixMatch(charArray, 0);
473 * Find the nth word in the dictionary that starts with the supplied prefix
475 * @see #getPrefixMatch(char[])
476 * @param charArray input prefix
477 * @param knownStart relative position in the dictionary to start
478 * @return index of word, or -1 if not found
480 public int getPrefixMatch(char[] charArray, int knownStart) {
481 short index = getWordItemTableIndex(charArray[0]);
484 char[][] items = wordItem_charArrayTable[wordIndexTable[index]];
485 int start = knownStart, end = items.length - 1;
487 int mid = (start + end) / 2, cmpResult;
489 // Binary search for the index of idArray
490 while (start <= end) {
491 cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0);
492 if (cmpResult == 0) {
493 // Get the first item which match the current word
495 && Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0)
498 return mid;// Find the first word that uses charArray as prefix.
499 } else if (cmpResult < 0)
503 mid = (start + end) / 2;
509 * Get the frequency of a word from the dictionary
511 * @param charArray input word
512 * @return word frequency, or zero if the word is not found
514 public int getFrequency(char[] charArray) {
515 short hashIndex = getWordItemTableIndex(charArray[0]);
518 int itemIndex = findInTable(hashIndex, charArray);
520 return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex];
526 * Return true if the dictionary entry at itemIndex for table charArray[0] is charArray
528 * @param charArray input word
529 * @param itemIndex item index for table charArray[0]
530 * @return true if the entry exists
532 public boolean isEqual(char[] charArray, int itemIndex) {
533 short hashIndex = getWordItemTableIndex(charArray[0]);
534 return Utility.compareArray(charArray, 1,
535 wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0;