--- /dev/null
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart.hhmm;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.RandomAccessFile;
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+
+import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
+
+/**
+ * SmartChineseAnalyzer Bigram dictionary.
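+ * <p>
+ * Minimal usage sketch (the key format shown, two words joined by
+ * WORD_SEGMENT_CHAR, is illustrative of how bigram lookup keys are built):
+ * <pre>
+ * BigramDictionary dict = BigramDictionary.getInstance();
+ * char[] key = ("\u4e2d\u56fd" + BigramDictionary.WORD_SEGMENT_CHAR + "\u4eba\u6c11").toCharArray();
+ * int freq = dict.getFrequency(key); // 0 if the bigram is unknown
+ * </pre>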
+ * @lucene.experimental
+ */
+class BigramDictionary extends AbstractDictionary {
+
+ private BigramDictionary() {
+ }
+
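+ // separator placed between the two words when a bigram is used as a lookup key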
+ public static final char WORD_SEGMENT_CHAR = '@';
+
+ private static BigramDictionary singleInstance;
+
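+ /*
+ * Size of the open-addressed hash table. A prime is used so that the double
+ * hashing probe sequence (hash1 + i * hash2) mod PRIME_BIGRAM_LENGTH can
+ * visit every slot.
+ */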
+ public static final int PRIME_BIGRAM_LENGTH = 402137;
+
+ /*
+ * The word associations are stored as FNV1 hash codes: only the 64-bit hash
+ * of each bigram is kept, not its text, which has a small probability of
+ * collision but saves memory.
+ */
+ private long[] bigramHashTable;
+
+ private int[] frequencyTable;
+
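+ // probe-length statistics gathered during lookups, for diagnostics only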
+ private int max = 0;
+
+ private int repeat = 0;
+
+ // static Logger log = Logger.getLogger(BigramDictionary.class);
+
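+ /**
+ * Get the singleton dictionary instance, loading it on first use: first from
+ * the serialized bigramdict.mem resource on the classpath, then, if that
+ * fails with an IOException, from the bigramdict.dct file under
+ * AnalyzerProfile.ANALYSIS_DATA_DIR.
+ */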
+ public static synchronized BigramDictionary getInstance() {
+ if (singleInstance == null) {
+ singleInstance = new BigramDictionary();
+ try {
+ singleInstance.load();
+ } catch (IOException e) {
+ String dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
+ singleInstance.load(dictRoot);
+ } catch (ClassNotFoundException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ return singleInstance;
+ }
+
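+ /**
+ * Attempt to load the pre-serialized hash and frequency tables; returns false
+ * if the file is missing or unreadable, so the caller can rebuild them from
+ * the .dct source file.
+ */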
+ private boolean loadFromObj(File serialObj) {
+ try {
+ loadFromInputStream(new FileInputStream(serialObj));
+ return true;
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ } catch (ClassNotFoundException e) {
+ e.printStackTrace();
+ }
+ return false;
+ }
+
+ private void loadFromInputStream(InputStream serialObjectInputStream)
+ throws IOException, ClassNotFoundException {
+ ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
+ try {
+ bigramHashTable = (long[]) input.readObject();
+ frequencyTable = (int[]) input.readObject();
+ // log.info("load bigram dict from serialization.");
+ } finally {
+ // close in finally so the stream is not leaked when readObject fails
+ input.close();
+ }
+ }
+
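+ /**
+ * Serialize the hash and frequency tables so that later loads can skip
+ * parsing the .dct file; failures are deliberately ignored because the
+ * serialized form is only a cache.
+ */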
+ private void saveToObj(File serialObj) {
+ try {
+ ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
+ serialObj));
+ output.writeObject(bigramHashTable);
+ output.writeObject(frequencyTable);
+ output.close();
+ // log.info("serialize bigram dict.");
+ } catch (Exception e) {
+ // log.warn(e.getMessage());
+ }
+ }
+
+ private void load() throws IOException, ClassNotFoundException {
+ InputStream input = this.getClass().getResourceAsStream("bigramdict.mem");
+ if (input == null) {
+ // let getInstance() fall back to loading from AnalyzerProfile.ANALYSIS_DATA_DIR
+ throw new FileNotFoundException("bigramdict.mem not found on classpath");
+ }
+ loadFromInputStream(input);
+ }
+
+ private void load(String dictRoot) {
+ String bigramDictPath = dictRoot + "/bigramdict.dct";
+
+ File serialObj = new File(dictRoot + "/bigramdict.mem");
+
+ if (!(serialObj.exists() && loadFromObj(serialObj))) {
+ try {
+ bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
+ frequencyTable = new int[PRIME_BIGRAM_LENGTH];
+ for (int i = 0; i < PRIME_BIGRAM_LENGTH; i++) {
+ // it is possible for a value to hash to 0, but the probability is extremely low
+ bigramHashTable[i] = 0;
+ frequencyTable[i] = 0;
+ }
+ loadFromFile(bigramDictPath);
+ } catch (IOException e) {
+ // wrap with the cause so the original stack trace is preserved
+ throw new RuntimeException(e);
+ }
+ saveToObj(serialObj);
+ }
+ }
+
+ /**
+ * Load the datafile into this BigramDictionary
+ *
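+ * The on-disk layout, per GB2312 character slot, is: an int count of entries,
+ * then for each entry an int frequency, an int length, an int handle (unused
+ * here), followed by length bytes of GB2312-encoded text. All ints are
+ * little-endian.
+ *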
+ * @param dctFilePath path to the Bigramdictionary (bigramdict.dct)
+ * @throws FileNotFoundException if the dictionary file cannot be found
+ * @throws IOException if there is a low-level I/O error
+ * @throws UnsupportedEncodingException if the GB2312 encoding is unsupported
+ */
+ public void loadFromFile(String dctFilePath) throws FileNotFoundException,
+ IOException, UnsupportedEncodingException {
+
+ int i, cnt, length, total = 0;
+ // The file covers 6768 slots: 6763 GB2312 characters plus 5 reserved slots
+ // (3756~3760); the 3756th slot is used as a header to store special entries.
+ int[] buffer = new int[3];
+ byte[] intBuffer = new byte[4];
+ String tmpword;
+ RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
+
+ // GB2312 characters 0 - 6768
+ for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
+ String currentStr = getCCByGB2312Id(i);
+
+ // readFully: a plain read() may return fewer bytes than requested
+ dctFile.readFully(intBuffer);
+ // the dictionary was developed for C, and byte order must be converted to work with Java
+ cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
+ if (cnt <= 0) {
+ continue;
+ }
+ total += cnt;
+ int j = 0;
+ while (j < cnt) {
+ dctFile.readFully(intBuffer);
+ buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
+ .getInt();// frequency
+ dctFile.readFully(intBuffer);
+ buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
+ .getInt();// length
+ dctFile.readFully(intBuffer);
+ // buffer[2] = ByteBuffer.wrap(intBuffer).order(
+ // ByteOrder.LITTLE_ENDIAN).getInt();// handle (unused)
+
+ length = buffer[1];
+ if (length > 0) {
+ byte[] lchBuffer = new byte[length];
+ dctFile.readFully(lchBuffer);
+ tmpword = new String(lchBuffer, "GB2312");
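+ // the header slot (the 3756th, i == 3755 + GB2312_FIRST_CHAR) stores
+ // complete bigram strings, so the current character is not prepended for it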
+ if (i != 3755 + GB2312_FIRST_CHAR) {
+ tmpword = currentStr + tmpword;
+ }
+ char[] carray = tmpword.toCharArray();
+ long hashId = hash1(carray);
+ int index = getAvailableIndex(hashId, carray);
+ if (index != -1) {
+ if (bigramHashTable[index] == 0) {
+ bigramHashTable[index] = hashId;
+ }
+ frequencyTable[index] += buffer[0];
+ }
+ }
+ j++;
+ }
+ }
+ dctFile.close();
+ // log.info("load dictionary done! " + dctFilePath + " total:" + total);
+ }
+
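+ /*
+ * Open addressing with double hashing: the probe sequence is
+ * (hash1 + i * hash2) mod PRIME_BIGRAM_LENGTH for i = 1, 2, ... Returns the
+ * slot where hashId is already stored or can be stored, or -1 if the table
+ * has been probed exhaustively.
+ */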
+ private int getAvailableIndex(long hashId, char[] carray) {
+ int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
+ int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
+ if (hash1 < 0) {
+ hash1 = PRIME_BIGRAM_LENGTH + hash1;
+ }
+ if (hash2 < 0) {
+ hash2 = PRIME_BIGRAM_LENGTH + hash2;
+ }
+ int index = hash1;
+ int i = 1;
+ while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
+ && i < PRIME_BIGRAM_LENGTH) {
+ index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
+ i++;
+ }
+
+ if (i < PRIME_BIGRAM_LENGTH
+ && (bigramHashTable[index] == 0 || bigramHashTable[index] == hashId)) {
+ return index;
+ } else {
+ return -1;
+ }
+ }
+
+ /*
+ * Look up the slot index for a bigram in the frequency table, using the same
+ * double hashing probe sequence as getAvailableIndex; returns -1 when the
+ * bigram is absent. Also updates the repeat/max probe statistics.
+ */
+ private int getBigramItemIndex(char carray[]) {
+ long hashId = hash1(carray);
+ int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
+ int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
+ if (hash1 < 0) {
+ hash1 = PRIME_BIGRAM_LENGTH + hash1;
+ }
+ if (hash2 < 0) {
+ hash2 = PRIME_BIGRAM_LENGTH + hash2;
+ }
+ int index = hash1;
+ int i = 1;
+ repeat++;
+ while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
+ && i < PRIME_BIGRAM_LENGTH) {
+ index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
+ i++;
+ repeat++;
+ if (i > max) {
+ max = i;
+ }
+ }
+
+ if (i < PRIME_BIGRAM_LENGTH && bigramHashTable[index] == hashId) {
+ return index;
+ } else {
+ return -1;
+ }
+ }
+
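+ /**
+ * Get the frequency of a bigram from the dictionary.
+ *
+ * @param carray the characters of the bigram to look up
+ * @return the stored frequency, or 0 if the bigram is not present
+ */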
+ public int getFrequency(char[] carray) {
+ int index = getBigramItemIndex(carray);
+ if (index != -1) {
+ return frequencyTable[index];
+ }
+ return 0;
+ }
+
+}