+++ /dev/null
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.cn.smart.hhmm;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.ObjectInputStream;
-import java.io.ObjectOutputStream;
-import java.io.RandomAccessFile;
-import java.io.UnsupportedEncodingException;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-
-import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
-import org.apache.lucene.analysis.cn.smart.Utility;
-
-/**
- * SmartChineseAnalyzer Word Dictionary
- * @lucene.experimental
- */
-class WordDictionary extends AbstractDictionary {
-
- private WordDictionary() {
- }
-
- private static WordDictionary singleInstance;
-
- /**
- * Large prime number for hash function
- */
- public static final int PRIME_INDEX_LENGTH = 12071;
-
- /**
- * wordIndexTable guarantees to hash all Chinese characters in Unicode into
- * PRIME_INDEX_LENGTH array. There will be conflict, but in reality this
- * program only handles the 6768 characters found in GB2312 plus some
- * ASCII characters. Therefore in order to guarantee better precision, it is
- * necessary to retain the original symbol in the charIndexTable.
- */
- private short[] wordIndexTable;
-
- private char[] charIndexTable;
-
- /**
- * To avoid taking too much space, the data structure needed to store the
- * lexicon requires two multidimensional arrays to store word and frequency.
- * Each word is placed in a char[]. Each char represents a Chinese char or
- * other symbol. Each frequency is put into an int. These two arrays
- * correspond to each other one-to-one. Therefore, one can use
- * wordItem_charArrayTable[i][j] to look up word from lexicon, and
- * wordItem_frequencyTable[i][j] to look up the corresponding frequency.
- */
- private char[][][] wordItem_charArrayTable;
-
- private int[][] wordItem_frequencyTable;
-
- // static Logger log = Logger.getLogger(WordDictionary.class);
-
- /**
- * Get the singleton dictionary instance.
- * @return singleton
- */
- public synchronized static WordDictionary getInstance() {
- if (singleInstance == null) {
- singleInstance = new WordDictionary();
- try {
- singleInstance.load();
- } catch (IOException e) {
- String wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
- singleInstance.load(wordDictRoot);
- } catch (ClassNotFoundException e) {
- throw new RuntimeException(e);
- }
- }
- return singleInstance;
- }
-
- /**
- * Attempt to load dictionary from provided directory, first trying coredict.mem, failing back on coredict.dct
- *
- * @param dctFileRoot path to dictionary directory
- */
- public void load(String dctFileRoot) {
- String dctFilePath = dctFileRoot + "/coredict.dct";
- File serialObj = new File(dctFileRoot + "/coredict.mem");
-
- if (serialObj.exists() && loadFromObj(serialObj)) {
-
- } else {
- try {
- wordIndexTable = new short[PRIME_INDEX_LENGTH];
- charIndexTable = new char[PRIME_INDEX_LENGTH];
- for (int i = 0; i < PRIME_INDEX_LENGTH; i++) {
- charIndexTable[i] = 0;
- wordIndexTable[i] = -1;
- }
- wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][];
- wordItem_frequencyTable = new int[GB2312_CHAR_NUM][];
- // int total =
- loadMainDataFromFile(dctFilePath);
- expandDelimiterData();
- mergeSameWords();
- sortEachItems();
- // log.info("load dictionary: " + dctFilePath + " total:" + total);
- } catch (IOException e) {
- throw new RuntimeException(e.getMessage());
- }
-
- saveToObj(serialObj);
- }
-
- }
-
- /**
- * Load coredict.mem internally from the jar file.
- *
- * @throws ClassNotFoundException
- * @throws IOException
- */
- public void load() throws IOException, ClassNotFoundException {
- InputStream input = this.getClass().getResourceAsStream("coredict.mem");
- loadFromObjectInputStream(input);
- }
-
- private boolean loadFromObj(File serialObj) {
- try {
- loadFromObjectInputStream(new FileInputStream(serialObj));
- return true;
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } catch (ClassNotFoundException e) {
- e.printStackTrace();
- }
- return false;
- }
-
- private void loadFromObjectInputStream(InputStream serialObjectInputStream)
- throws IOException, ClassNotFoundException {
- ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
- wordIndexTable = (short[]) input.readObject();
- charIndexTable = (char[]) input.readObject();
- wordItem_charArrayTable = (char[][][]) input.readObject();
- wordItem_frequencyTable = (int[][]) input.readObject();
- // log.info("load core dict from serialization.");
- input.close();
- }
-
- private void saveToObj(File serialObj) {
- try {
- ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
- serialObj));
- output.writeObject(wordIndexTable);
- output.writeObject(charIndexTable);
- output.writeObject(wordItem_charArrayTable);
- output.writeObject(wordItem_frequencyTable);
- output.close();
- // log.info("serialize core dict.");
- } catch (Exception e) {
- // log.warn(e.getMessage());
- }
- }
-
- /**
- * Load the datafile into this WordDictionary
- *
- * @param dctFilePath path to word dictionary (coredict.dct)
- * @return number of words read
- * @throws FileNotFoundException
- * @throws IOException
- * @throws UnsupportedEncodingException
- */
- private int loadMainDataFromFile(String dctFilePath)
- throws FileNotFoundException, IOException, UnsupportedEncodingException {
- int i, cnt, length, total = 0;
- // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
- // The 3756th is used (as a header) to store information.
- int[] buffer = new int[3];
- byte[] intBuffer = new byte[4];
- String tmpword;
- RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
-
- // GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
- // if (i == 5231)
- // System.out.println(i);
-
- dctFile.read(intBuffer);
- // the dictionary was developed for C, and byte order must be converted to work with Java
- cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
- if (cnt <= 0) {
- wordItem_charArrayTable[i] = null;
- wordItem_frequencyTable[i] = null;
- continue;
- }
- wordItem_charArrayTable[i] = new char[cnt][];
- wordItem_frequencyTable[i] = new int[cnt];
- total += cnt;
- int j = 0;
- while (j < cnt) {
- // wordItemTable[i][j] = new WordItem();
- dctFile.read(intBuffer);
- buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
- .getInt();// frequency
- dctFile.read(intBuffer);
- buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
- .getInt();// length
- dctFile.read(intBuffer);
- buffer[2] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
- .getInt();// handle
-
- // wordItemTable[i][j].frequency = buffer[0];
- wordItem_frequencyTable[i][j] = buffer[0];
-
- length = buffer[1];
- if (length > 0) {
- byte[] lchBuffer = new byte[length];
- dctFile.read(lchBuffer);
- tmpword = new String(lchBuffer, "GB2312");
- // indexTable[i].wordItems[j].word = tmpword;
- // wordItemTable[i][j].charArray = tmpword.toCharArray();
- wordItem_charArrayTable[i][j] = tmpword.toCharArray();
- } else {
- // wordItemTable[i][j].charArray = null;
- wordItem_charArrayTable[i][j] = null;
- }
- // System.out.println(indexTable[i].wordItems[j]);
- j++;
- }
-
- String str = getCCByGB2312Id(i);
- setTableIndex(str.charAt(0), i);
- }
- dctFile.close();
- return total;
- }
-
- /**
- * The original lexicon puts all information with punctuation into a
- * chart (from 1 to 3755). Here it then gets expanded, separately being
- * placed into the chart that has the corresponding symbol.
- */
- private void expandDelimiterData() {
- int i;
- int cnt;
- // Punctuation then treating index 3755 as 1,
- // distribute the original punctuation corresponding dictionary into
- int delimiterIndex = 3755 + GB2312_FIRST_CHAR;
- i = 0;
- while (i < wordItem_charArrayTable[delimiterIndex].length) {
- char c = wordItem_charArrayTable[delimiterIndex][i][0];
- int j = getGB2312Id(c);// the id value of the punctuation
- if (wordItem_charArrayTable[j] == null) {
-
- int k = i;
- // Starting from i, count the number of the following worditem symbol from j
- while (k < wordItem_charArrayTable[delimiterIndex].length
- && wordItem_charArrayTable[delimiterIndex][k][0] == c) {
- k++;
- }
- // c is the punctuation character, j is the id value of c
- // k-1 represents the index of the last punctuation character
- cnt = k - i;
- if (cnt != 0) {
- wordItem_charArrayTable[j] = new char[cnt][];
- wordItem_frequencyTable[j] = new int[cnt];
- }
-
- // Assign value for each wordItem.
- for (k = 0; k < cnt; k++, i++) {
- // wordItemTable[j][k] = new WordItem();
- wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i];
- wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].length - 1];
- System.arraycopy(wordItem_charArrayTable[delimiterIndex][i], 1,
- wordItem_charArrayTable[j][k], 0,
- wordItem_charArrayTable[j][k].length);
- }
- setTableIndex(c, j);
- }
- }
- // Delete the original corresponding symbol array.
- wordItem_charArrayTable[delimiterIndex] = null;
- wordItem_frequencyTable[delimiterIndex] = null;
- }
-
- /*
- * since we aren't doing POS-tagging, merge the frequencies for entries of the same word (with different POS)
- */
- private void mergeSameWords() {
- int i;
- for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
- if (wordItem_charArrayTable[i] == null)
- continue;
- int len = 1;
- for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
- if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
- wordItem_charArrayTable[i][j - 1], 0) != 0)
- len++;
-
- }
- if (len < wordItem_charArrayTable[i].length) {
- char[][] tempArray = new char[len][];
- int[] tempFreq = new int[len];
- int k = 0;
- tempArray[0] = wordItem_charArrayTable[i][0];
- tempFreq[0] = wordItem_frequencyTable[i][0];
- for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
- if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
- tempArray[k], 0) != 0) {
- k++;
- // temp[k] = wordItemTable[i][j];
- tempArray[k] = wordItem_charArrayTable[i][j];
- tempFreq[k] = wordItem_frequencyTable[i][j];
- } else {
- // temp[k].frequency += wordItemTable[i][j].frequency;
- tempFreq[k] += wordItem_frequencyTable[i][j];
- }
- }
- // wordItemTable[i] = temp;
- wordItem_charArrayTable[i] = tempArray;
- wordItem_frequencyTable[i] = tempFreq;
- }
- }
- }
-
- private void sortEachItems() {
- char[] tmpArray;
- int tmpFreq;
- for (int i = 0; i < wordItem_charArrayTable.length; i++) {
- if (wordItem_charArrayTable[i] != null
- && wordItem_charArrayTable[i].length > 1) {
- for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) {
- for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) {
- if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
- wordItem_charArrayTable[i][j2], 0) > 0) {
- tmpArray = wordItem_charArrayTable[i][j];
- tmpFreq = wordItem_frequencyTable[i][j];
- wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2];
- wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2];
- wordItem_charArrayTable[i][j2] = tmpArray;
- wordItem_frequencyTable[i][j2] = tmpFreq;
- }
- }
- }
- }
- }
- }
-
- /*
- * Calculate character c's position in hash table,
- * then initialize the value of that position in the address table.
- */
- private boolean setTableIndex(char c, int j) {
- int index = getAvaliableTableIndex(c);
- if (index != -1) {
- charIndexTable[index] = c;
- wordIndexTable[index] = (short) j;
- return true;
- } else
- return false;
- }
-
- private short getAvaliableTableIndex(char c) {
- int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
- int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
- if (hash1 < 0)
- hash1 = PRIME_INDEX_LENGTH + hash1;
- if (hash2 < 0)
- hash2 = PRIME_INDEX_LENGTH + hash2;
- int index = hash1;
- int i = 1;
- while (charIndexTable[index] != 0 && charIndexTable[index] != c
- && i < PRIME_INDEX_LENGTH) {
- index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
- i++;
- }
- // System.out.println(i - 1);
-
- if (i < PRIME_INDEX_LENGTH
- && (charIndexTable[index] == 0 || charIndexTable[index] == c)) {
- return (short) index;
- } else
- return -1;
- }
-
- private short getWordItemTableIndex(char c) {
- int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
- int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
- if (hash1 < 0)
- hash1 = PRIME_INDEX_LENGTH + hash1;
- if (hash2 < 0)
- hash2 = PRIME_INDEX_LENGTH + hash2;
- int index = hash1;
- int i = 1;
- while (charIndexTable[index] != 0 && charIndexTable[index] != c
- && i < PRIME_INDEX_LENGTH) {
- index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
- i++;
- }
-
- if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) {
- return (short) index;
- } else
- return -1;
- }
-
- /**
- * Look up the text string corresponding with the word char array,
- * and return the position of the word list.
- *
- * @param knownHashIndex already figure out position of the first word
- * symbol charArray[0] in hash table. If not calculated yet, can be
- * replaced with function int findInTable(char[] charArray).
- * @param charArray look up the char array corresponding with the word.
- * @return word location in word array. If not found, then return -1.
- */
- private int findInTable(short knownHashIndex, char[] charArray) {
- if (charArray == null || charArray.length == 0)
- return -1;
-
- char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]];
- int start = 0, end = items.length - 1;
- int mid = (start + end) / 2, cmpResult;
-
- // Binary search for the index of idArray
- while (start <= end) {
- cmpResult = Utility.compareArray(items[mid], 0, charArray, 1);
-
- if (cmpResult == 0)
- return mid;// find it
- else if (cmpResult < 0)
- start = mid + 1;
- else if (cmpResult > 0)
- end = mid - 1;
-
- mid = (start + end) / 2;
- }
- return -1;
- }
-
- /**
- * Find the first word in the dictionary that starts with the supplied prefix
- *
- * @see #getPrefixMatch(char[], int)
- * @param charArray input prefix
- * @return index of word, or -1 if not found
- */
- public int getPrefixMatch(char[] charArray) {
- return getPrefixMatch(charArray, 0);
- }
-
- /**
- * Find the nth word in the dictionary that starts with the supplied prefix
- *
- * @see #getPrefixMatch(char[])
- * @param charArray input prefix
- * @param knownStart relative position in the dictionary to start
- * @return index of word, or -1 if not found
- */
- public int getPrefixMatch(char[] charArray, int knownStart) {
- short index = getWordItemTableIndex(charArray[0]);
- if (index == -1)
- return -1;
- char[][] items = wordItem_charArrayTable[wordIndexTable[index]];
- int start = knownStart, end = items.length - 1;
-
- int mid = (start + end) / 2, cmpResult;
-
- // Binary search for the index of idArray
- while (start <= end) {
- cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0);
- if (cmpResult == 0) {
- // Get the first item which match the current word
- while (mid >= 0
- && Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0)
- mid--;
- mid++;
- return mid;// Find the first word that uses charArray as prefix.
- } else if (cmpResult < 0)
- end = mid - 1;
- else
- start = mid + 1;
- mid = (start + end) / 2;
- }
- return -1;
- }
-
- /**
- * Get the frequency of a word from the dictionary
- *
- * @param charArray input word
- * @return word frequency, or zero if the word is not found
- */
- public int getFrequency(char[] charArray) {
- short hashIndex = getWordItemTableIndex(charArray[0]);
- if (hashIndex == -1)
- return 0;
- int itemIndex = findInTable(hashIndex, charArray);
- if (itemIndex != -1)
- return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex];
- return 0;
-
- }
-
- /**
- * Return true if the dictionary entry at itemIndex for table charArray[0] is charArray
- *
- * @param charArray input word
- * @param itemIndex item index for table charArray[0]
- * @return true if the entry exists
- */
- public boolean isEqual(char[] charArray, int itemIndex) {
- short hashIndex = getWordItemTableIndex(charArray[0]);
- return Utility.compareArray(charArray, 1,
- wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0;
- }
-
-}