+++ /dev/null
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.cn.smart.hhmm;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.ObjectInputStream;
-import java.io.ObjectOutputStream;
-import java.io.RandomAccessFile;
-import java.io.UnsupportedEncodingException;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-
-import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
-
-/**
- * SmartChineseAnalyzer Bigram dictionary.
- * @lucene.experimental
- */
-class BigramDictionary extends AbstractDictionary {
-
- private BigramDictionary() {
- }
-
- public static final char WORD_SEGMENT_CHAR = '@';
-
- private static BigramDictionary singleInstance;
-
- public static final int PRIME_BIGRAM_LENGTH = 402137;
-
- /*
- * The word associations are stored as FNV1 hashcodes, which have a small probability of collision, but save memory.
- */
- private long[] bigramHashTable;
-
- private int[] frequencyTable;
-
- private int max = 0;
-
- private int repeat = 0;
-
- // static Logger log = Logger.getLogger(BigramDictionary.class);
-
- public synchronized static BigramDictionary getInstance() {
- if (singleInstance == null) {
- singleInstance = new BigramDictionary();
- try {
- singleInstance.load();
- } catch (IOException e) {
- String dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
- singleInstance.load(dictRoot);
- } catch (ClassNotFoundException e) {
- throw new RuntimeException(e);
- }
- }
- return singleInstance;
- }
-
- private boolean loadFromObj(File serialObj) {
- try {
- loadFromInputStream(new FileInputStream(serialObj));
- return true;
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } catch (ClassNotFoundException e) {
- e.printStackTrace();
- }
- return false;
- }
-
- private void loadFromInputStream(InputStream serialObjectInputStream)
- throws IOException, ClassNotFoundException {
- ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
- bigramHashTable = (long[]) input.readObject();
- frequencyTable = (int[]) input.readObject();
- // log.info("load bigram dict from serialization.");
- input.close();
- }
-
- private void saveToObj(File serialObj) {
- try {
- ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
- serialObj));
- output.writeObject(bigramHashTable);
- output.writeObject(frequencyTable);
- output.close();
- // log.info("serialize bigram dict.");
- } catch (Exception e) {
- // log.warn(e.getMessage());
- }
- }
-
- private void load() throws IOException, ClassNotFoundException {
- InputStream input = this.getClass().getResourceAsStream("bigramdict.mem");
- loadFromInputStream(input);
- }
-
- private void load(String dictRoot) {
- String bigramDictPath = dictRoot + "/bigramdict.dct";
-
- File serialObj = new File(dictRoot + "/bigramdict.mem");
-
- if (serialObj.exists() && loadFromObj(serialObj)) {
-
- } else {
- try {
- bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
- frequencyTable = new int[PRIME_BIGRAM_LENGTH];
- for (int i = 0; i < PRIME_BIGRAM_LENGTH; i++) {
- // it is possible for a value to hash to 0, but the probability is extremely low
- bigramHashTable[i] = 0;
- frequencyTable[i] = 0;
- }
- loadFromFile(bigramDictPath);
- } catch (IOException e) {
- throw new RuntimeException(e.getMessage());
- }
- saveToObj(serialObj);
- }
- }
-
- /**
- * Load the datafile into this BigramDictionary
- *
- * @param dctFilePath path to the Bigramdictionary (bigramdict.dct)
- * @throws FileNotFoundException
- * @throws IOException
- * @throws UnsupportedEncodingException
- */
- public void loadFromFile(String dctFilePath) throws FileNotFoundException,
- IOException, UnsupportedEncodingException {
-
- int i, cnt, length, total = 0;
- // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
- // The 3756th is used (as a header) to store information.
- int[] buffer = new int[3];
- byte[] intBuffer = new byte[4];
- String tmpword;
- RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
-
- // GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
- String currentStr = getCCByGB2312Id(i);
- // if (i == 5231)
- // System.out.println(i);
-
- dctFile.read(intBuffer);
- // the dictionary was developed for C, and byte order must be converted to work with Java
- cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
- if (cnt <= 0) {
- continue;
- }
- total += cnt;
- int j = 0;
- while (j < cnt) {
- dctFile.read(intBuffer);
- buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
- .getInt();// frequency
- dctFile.read(intBuffer);
- buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
- .getInt();// length
- dctFile.read(intBuffer);
- // buffer[2] = ByteBuffer.wrap(intBuffer).order(
- // ByteOrder.LITTLE_ENDIAN).getInt();// handle
-
- length = buffer[1];
- if (length > 0) {
- byte[] lchBuffer = new byte[length];
- dctFile.read(lchBuffer);
- tmpword = new String(lchBuffer, "GB2312");
- if (i != 3755 + GB2312_FIRST_CHAR) {
- tmpword = currentStr + tmpword;
- }
- char carray[] = tmpword.toCharArray();
- long hashId = hash1(carray);
- int index = getAvaliableIndex(hashId, carray);
- if (index != -1) {
- if (bigramHashTable[index] == 0) {
- bigramHashTable[index] = hashId;
- // bigramStringTable[index] = tmpword;
- }
- frequencyTable[index] += buffer[0];
- }
- }
- j++;
- }
- }
- dctFile.close();
- // log.info("load dictionary done! " + dctFilePath + " total:" + total);
- }
-
- private int getAvaliableIndex(long hashId, char carray[]) {
- int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
- int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
- if (hash1 < 0)
- hash1 = PRIME_BIGRAM_LENGTH + hash1;
- if (hash2 < 0)
- hash2 = PRIME_BIGRAM_LENGTH + hash2;
- int index = hash1;
- int i = 1;
- while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
- && i < PRIME_BIGRAM_LENGTH) {
- index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
- i++;
- }
- // System.out.println(i - 1);
-
- if (i < PRIME_BIGRAM_LENGTH
- && (bigramHashTable[index] == 0 || bigramHashTable[index] == hashId)) {
- return index;
- } else
- return -1;
- }
-
- /*
- * lookup the index into the frequency array.
- */
- private int getBigramItemIndex(char carray[]) {
- long hashId = hash1(carray);
- int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
- int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
- if (hash1 < 0)
- hash1 = PRIME_BIGRAM_LENGTH + hash1;
- if (hash2 < 0)
- hash2 = PRIME_BIGRAM_LENGTH + hash2;
- int index = hash1;
- int i = 1;
- repeat++;
- while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
- && i < PRIME_BIGRAM_LENGTH) {
- index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
- i++;
- repeat++;
- if (i > max)
- max = i;
- }
- // System.out.println(i - 1);
-
- if (i < PRIME_BIGRAM_LENGTH && bigramHashTable[index] == hashId) {
- return index;
- } else
- return -1;
- }
-
- public int getFrequency(char[] carray) {
- int index = getBigramItemIndex(carray);
- if (index != -1)
- return frequencyTable[index];
- return 0;
- }
-
-}