lucene-java-3.4.0/lucene/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 package org.apache.lucene.analysis.cn.smart.hhmm;
  19
  20 import java.io.File;
  21 import java.io.FileInputStream;
  22 import java.io.FileNotFoundException;
  23 import java.io.FileOutputStream;
  24 import java.io.IOException;
  25 import java.io.InputStream;
  26 import java.io.ObjectInputStream;
  27 import java.io.ObjectOutputStream;
  28 import java.io.RandomAccessFile;
  29 import java.io.UnsupportedEncodingException;
  30 import java.nio.ByteBuffer;
  31 import java.nio.ByteOrder;
  32
  33 import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
  34 import org.apache.lucene.analysis.cn.smart.Utility;
  35
  36 /**
  37  * SmartChineseAnalyzer Word Dictionary
  38  * @lucene.experimental
  39  */
  40 class WordDictionary extends AbstractDictionary {
  41
  42   private WordDictionary() {
  43   }
  44
  45   private static WordDictionary singleInstance;
  46
  47   /**
  48    * Large prime number for hash function
  49    */
  50   public static final int PRIME_INDEX_LENGTH = 12071;
  51
  52   /**
  53    * wordIndexTable guarantees to hash all Chinese characters in Unicode into
  54    * PRIME_INDEX_LENGTH array. There will be conflict, but in reality this
  55    * program only handles the 6768 characters found in GB2312 plus some
  56    * ASCII characters. Therefore in order to guarantee better precision, it is
  57    * necessary to retain the original symbol in the charIndexTable.
  58    */
  59   private short[] wordIndexTable;
  60
  61   private char[] charIndexTable;
  62
  63   /**
  64    * To avoid taking too much space, the data structure needed to store the
  65    * lexicon requires two multidimensional arrays to store word and frequency.
  66    * Each word is placed in a char[]. Each char represents a Chinese char or
  67    * other symbol.  Each frequency is put into an int. These two arrays
  68    * correspond to each other one-to-one. Therefore, one can use
  69    * wordItem_charArrayTable[i][j] to look up word from lexicon, and
  70    * wordItem_frequencyTable[i][j] to look up the corresponding frequency.
  71    */
  72   private char[][][] wordItem_charArrayTable;
  73
  74   private int[][] wordItem_frequencyTable;
  75
  76   // static Logger log = Logger.getLogger(WordDictionary.class);
  77
  78   /**
  79    * Get the singleton dictionary instance.
  80    * @return singleton
  81    */
  82   public synchronized static WordDictionary getInstance() {
  83     if (singleInstance == null) {
  84       singleInstance = new WordDictionary();
  85       try {
  86         singleInstance.load();
  87       } catch (IOException e) {
  88         String wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
  89         singleInstance.load(wordDictRoot);
  90       } catch (ClassNotFoundException e) {
  91         throw new RuntimeException(e);
  92       }
  93     }
  94     return singleInstance;
  95   }
  96
  97   /**
  98    * Attempt to load dictionary from provided directory, first trying coredict.mem, failing back on coredict.dct
  99    *
 100    * @param dctFileRoot path to dictionary directory
 101    */
 102   public void load(String dctFileRoot) {
 103     String dctFilePath = dctFileRoot + "/coredict.dct";
 104     File serialObj = new File(dctFileRoot + "/coredict.mem");
 105
 106     if (serialObj.exists() && loadFromObj(serialObj)) {
 107
 108     } else {
 109       try {
 110         wordIndexTable = new short[PRIME_INDEX_LENGTH];
 111         charIndexTable = new char[PRIME_INDEX_LENGTH];
 112         for (int i = 0; i < PRIME_INDEX_LENGTH; i++) {
 113           charIndexTable[i] = 0;
 114           wordIndexTable[i] = -1;
 115         }
 116         wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][];
 117         wordItem_frequencyTable = new int[GB2312_CHAR_NUM][];
 118         // int total =
 119         loadMainDataFromFile(dctFilePath);
 120         expandDelimiterData();
 121         mergeSameWords();
 122         sortEachItems();
 123         // log.info("load dictionary: " + dctFilePath + " total:" + total);
 124       } catch (IOException e) {
 125         throw new RuntimeException(e.getMessage());
 126       }
 127
 128       saveToObj(serialObj);
 129     }
 130
 131   }
 132
 133   /**
 134    * Load coredict.mem internally from the jar file.
 135    *
 136    * @throws ClassNotFoundException
 137    * @throws IOException
 138    */
 139   public void load() throws IOException, ClassNotFoundException {
 140     InputStream input = this.getClass().getResourceAsStream("coredict.mem");
 141     loadFromObjectInputStream(input);
 142   }
 143
 144   private boolean loadFromObj(File serialObj) {
 145     try {
 146       loadFromObjectInputStream(new FileInputStream(serialObj));
 147       return true;
 148     } catch (FileNotFoundException e) {
 149       e.printStackTrace();
 150     } catch (IOException e) {
 151       e.printStackTrace();
 152     } catch (ClassNotFoundException e) {
 153       e.printStackTrace();
 154     }
 155     return false;
 156   }
 157
 158   private void loadFromObjectInputStream(InputStream serialObjectInputStream)
 159       throws IOException, ClassNotFoundException {
 160     ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
 161     wordIndexTable = (short[]) input.readObject();
 162     charIndexTable = (char[]) input.readObject();
 163     wordItem_charArrayTable = (char[][][]) input.readObject();
 164     wordItem_frequencyTable = (int[][]) input.readObject();
 165     // log.info("load core dict from serialization.");
 166     input.close();
 167   }
 168
 169   private void saveToObj(File serialObj) {
 170     try {
 171       ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
 172           serialObj));
 173       output.writeObject(wordIndexTable);
 174       output.writeObject(charIndexTable);
 175       output.writeObject(wordItem_charArrayTable);
 176       output.writeObject(wordItem_frequencyTable);
 177       output.close();
 178       // log.info("serialize core dict.");
 179     } catch (Exception e) {
 180       // log.warn(e.getMessage());
 181     }
 182   }
 183
 184   /**
 185    * Load the datafile into this WordDictionary
 186    *
 187    * @param dctFilePath path to word dictionary (coredict.dct)
 188    * @return number of words read
 189    * @throws FileNotFoundException
 190    * @throws IOException
 191    * @throws UnsupportedEncodingException
 192    */
 193   private int loadMainDataFromFile(String dctFilePath)
 194       throws FileNotFoundException, IOException, UnsupportedEncodingException {
 195     int i, cnt, length, total = 0;
 196     // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
 197     // The 3756th is used (as a header) to store information.
 198     int[] buffer = new int[3];
 199     byte[] intBuffer = new byte[4];
 200     String tmpword;
 201     RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
 202
 203     // GB2312 characters 0 - 6768
 204     for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
 205       // if (i == 5231)
 206       // System.out.println(i);
 207
 208       dctFile.read(intBuffer);
 209       // the dictionary was developed for C, and byte order must be converted to work with Java
 210       cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
 211       if (cnt <= 0) {
 212         wordItem_charArrayTable[i] = null;
 213         wordItem_frequencyTable[i] = null;
 214         continue;
 215       }
 216       wordItem_charArrayTable[i] = new char[cnt][];
 217       wordItem_frequencyTable[i] = new int[cnt];
 218       total += cnt;
 219       int j = 0;
 220       while (j < cnt) {
 221         // wordItemTable[i][j] = new WordItem();
 222         dctFile.read(intBuffer);
 223         buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
 224             .getInt();// frequency
 225         dctFile.read(intBuffer);
 226         buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
 227             .getInt();// length
 228         dctFile.read(intBuffer);
 229         buffer[2] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
 230             .getInt();// handle
 231
 232         // wordItemTable[i][j].frequency = buffer[0];
 233         wordItem_frequencyTable[i][j] = buffer[0];
 234
 235         length = buffer[1];
 236         if (length > 0) {
 237           byte[] lchBuffer = new byte[length];
 238           dctFile.read(lchBuffer);
 239           tmpword = new String(lchBuffer, "GB2312");
 240           // indexTable[i].wordItems[j].word = tmpword;
 241           // wordItemTable[i][j].charArray = tmpword.toCharArray();
 242           wordItem_charArrayTable[i][j] = tmpword.toCharArray();
 243         } else {
 244           // wordItemTable[i][j].charArray = null;
 245           wordItem_charArrayTable[i][j] = null;
 246         }
 247         // System.out.println(indexTable[i].wordItems[j]);
 248         j++;
 249       }
 250
 251       String str = getCCByGB2312Id(i);
 252       setTableIndex(str.charAt(0), i);
 253     }
 254     dctFile.close();
 255     return total;
 256   }
 257
 258   /**
 259    * The original lexicon puts all information with punctuation into a
 260    * chart (from 1 to 3755). Here it then gets expanded, separately being
 261    * placed into the chart that has the corresponding symbol.
 262    */
 263   private void expandDelimiterData() {
 264     int i;
 265     int cnt;
 266     // Punctuation then treating index 3755 as 1,
 267     // distribute the original punctuation corresponding dictionary into
 268     int delimiterIndex = 3755 + GB2312_FIRST_CHAR;
 269     i = 0;
 270     while (i < wordItem_charArrayTable[delimiterIndex].length) {
 271       char c = wordItem_charArrayTable[delimiterIndex][i][0];
 272       int j = getGB2312Id(c);// the id value of the punctuation
 273       if (wordItem_charArrayTable[j] == null) {
 274
 275         int k = i;
 276         // Starting from i, count the number of the following worditem symbol from j
 277         while (k < wordItem_charArrayTable[delimiterIndex].length
 278             && wordItem_charArrayTable[delimiterIndex][k][0] == c) {
 279           k++;
 280         }
 281         // c is the punctuation character, j is the id value of c
 282         // k-1 represents the index of the last punctuation character
 283         cnt = k - i;
 284         if (cnt != 0) {
 285           wordItem_charArrayTable[j] = new char[cnt][];
 286           wordItem_frequencyTable[j] = new int[cnt];
 287         }
 288
 289         // Assign value for each wordItem.
 290         for (k = 0; k < cnt; k++, i++) {
 291           // wordItemTable[j][k] = new WordItem();
 292           wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i];
 293           wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].length - 1];
 294           System.arraycopy(wordItem_charArrayTable[delimiterIndex][i], 1,
 295               wordItem_charArrayTable[j][k], 0,
 296               wordItem_charArrayTable[j][k].length);
 297         }
 298         setTableIndex(c, j);
 299       }
 300     }
 301     // Delete the original corresponding symbol array.
 302     wordItem_charArrayTable[delimiterIndex] = null;
 303     wordItem_frequencyTable[delimiterIndex] = null;
 304   }
 305
 306   /*
 307    * since we aren't doing POS-tagging, merge the frequencies for entries of the same word (with different POS)
 308    */
 309   private void mergeSameWords() {
 310     int i;
 311     for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
 312       if (wordItem_charArrayTable[i] == null)
 313         continue;
 314       int len = 1;
 315       for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
 316         if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
 317             wordItem_charArrayTable[i][j - 1], 0) != 0)
 318           len++;
 319
 320       }
 321       if (len < wordItem_charArrayTable[i].length) {
 322         char[][] tempArray = new char[len][];
 323         int[] tempFreq = new int[len];
 324         int k = 0;
 325         tempArray[0] = wordItem_charArrayTable[i][0];
 326         tempFreq[0] = wordItem_frequencyTable[i][0];
 327         for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
 328           if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
 329               tempArray[k], 0) != 0) {
 330             k++;
 331             // temp[k] = wordItemTable[i][j];
 332             tempArray[k] = wordItem_charArrayTable[i][j];
 333             tempFreq[k] = wordItem_frequencyTable[i][j];
 334           } else {
 335             // temp[k].frequency += wordItemTable[i][j].frequency;
 336             tempFreq[k] += wordItem_frequencyTable[i][j];
 337           }
 338         }
 339         // wordItemTable[i] = temp;
 340         wordItem_charArrayTable[i] = tempArray;
 341         wordItem_frequencyTable[i] = tempFreq;
 342       }
 343     }
 344   }
 345
 346   private void sortEachItems() {
 347     char[] tmpArray;
 348     int tmpFreq;
 349     for (int i = 0; i < wordItem_charArrayTable.length; i++) {
 350       if (wordItem_charArrayTable[i] != null
 351           && wordItem_charArrayTable[i].length > 1) {
 352         for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) {
 353           for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) {
 354             if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
 355                 wordItem_charArrayTable[i][j2], 0) > 0) {
 356               tmpArray = wordItem_charArrayTable[i][j];
 357               tmpFreq = wordItem_frequencyTable[i][j];
 358               wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2];
 359               wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2];
 360               wordItem_charArrayTable[i][j2] = tmpArray;
 361               wordItem_frequencyTable[i][j2] = tmpFreq;
 362             }
 363           }
 364         }
 365       }
 366     }
 367   }
 368
 369   /*
 370    * Calculate character c's position in hash table,
 371    * then initialize the value of that position in the address table.
 372    */
 373   private boolean setTableIndex(char c, int j) {
 374     int index = getAvaliableTableIndex(c);
 375     if (index != -1) {
 376       charIndexTable[index] = c;
 377       wordIndexTable[index] = (short) j;
 378       return true;
 379     } else
 380       return false;
 381   }
 382
 383   private short getAvaliableTableIndex(char c) {
 384     int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
 385     int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
 386     if (hash1 < 0)
 387       hash1 = PRIME_INDEX_LENGTH + hash1;
 388     if (hash2 < 0)
 389       hash2 = PRIME_INDEX_LENGTH + hash2;
 390     int index = hash1;
 391     int i = 1;
 392     while (charIndexTable[index] != 0 && charIndexTable[index] != c
 393         && i < PRIME_INDEX_LENGTH) {
 394       index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
 395       i++;
 396     }
 397     // System.out.println(i - 1);
 398
 399     if (i < PRIME_INDEX_LENGTH
 400         && (charIndexTable[index] == 0 || charIndexTable[index] == c)) {
 401       return (short) index;
 402     } else
 403       return -1;
 404   }
 405
 406   private short getWordItemTableIndex(char c) {
 407     int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
 408     int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
 409     if (hash1 < 0)
 410       hash1 = PRIME_INDEX_LENGTH + hash1;
 411     if (hash2 < 0)
 412       hash2 = PRIME_INDEX_LENGTH + hash2;
 413     int index = hash1;
 414     int i = 1;
 415     while (charIndexTable[index] != 0 && charIndexTable[index] != c
 416         && i < PRIME_INDEX_LENGTH) {
 417       index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
 418       i++;
 419     }
 420
 421     if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) {
 422       return (short) index;
 423     } else
 424       return -1;
 425   }
 426
 427   /**
 428    * Look up the text string corresponding with the word char array,
 429    * and return the position of the word list.
 430    *
 431    * @param knownHashIndex already figure out position of the first word
 432    *   symbol charArray[0] in hash table. If not calculated yet, can be
 433    *   replaced with function int findInTable(char[] charArray).
 434    * @param charArray look up the char array corresponding with the word.
 435    * @return word location in word array.  If not found, then return -1.
 436    */
 437   private int findInTable(short knownHashIndex, char[] charArray) {
 438     if (charArray == null || charArray.length == 0)
 439       return -1;
 440
 441     char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]];
 442     int start = 0, end = items.length - 1;
 443     int mid = (start + end) / 2, cmpResult;
 444
 445     // Binary search for the index of idArray
 446     while (start <= end) {
 447       cmpResult = Utility.compareArray(items[mid], 0, charArray, 1);
 448
 449       if (cmpResult == 0)
 450         return mid;// find it
 451       else if (cmpResult < 0)
 452         start = mid + 1;
 453       else if (cmpResult > 0)
 454         end = mid - 1;
 455
 456       mid = (start + end) / 2;
 457     }
 458     return -1;
 459   }
 460
 461   /**
 462    * Find the first word in the dictionary that starts with the supplied prefix
 463    *
 464    * @see #getPrefixMatch(char[], int)
 465    * @param charArray input prefix
 466    * @return index of word, or -1 if not found
 467    */
 468   public int getPrefixMatch(char[] charArray) {
 469     return getPrefixMatch(charArray, 0);
 470   }
 471
 472   /**
 473    * Find the nth word in the dictionary that starts with the supplied prefix
 474    *
 475    * @see #getPrefixMatch(char[])
 476    * @param charArray input prefix
 477    * @param knownStart relative position in the dictionary to start
 478    * @return index of word, or -1 if not found
 479    */
 480   public int getPrefixMatch(char[] charArray, int knownStart) {
 481     short index = getWordItemTableIndex(charArray[0]);
 482     if (index == -1)
 483       return -1;
 484     char[][] items = wordItem_charArrayTable[wordIndexTable[index]];
 485     int start = knownStart, end = items.length - 1;
 486
 487     int mid = (start + end) / 2, cmpResult;
 488
 489     // Binary search for the index of idArray
 490     while (start <= end) {
 491       cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0);
 492       if (cmpResult == 0) {
 493         // Get the first item which match the current word
 494         while (mid >= 0
 495             && Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0)
 496           mid--;
 497         mid++;
 498         return mid;// Find the first word that uses charArray as prefix.
 499       } else if (cmpResult < 0)
 500         end = mid - 1;
 501       else
 502         start = mid + 1;
 503       mid = (start + end) / 2;
 504     }
 505     return -1;
 506   }
 507
 508   /**
 509    * Get the frequency of a word from the dictionary
 510    *
 511    * @param charArray input word
 512    * @return word frequency, or zero if the word is not found
 513    */
 514   public int getFrequency(char[] charArray) {
 515     short hashIndex = getWordItemTableIndex(charArray[0]);
 516     if (hashIndex == -1)
 517       return 0;
 518     int itemIndex = findInTable(hashIndex, charArray);
 519     if (itemIndex != -1)
 520       return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex];
 521     return 0;
 522
 523   }
 524
 525   /**
 526    * Return true if the dictionary entry at itemIndex for table charArray[0] is charArray
 527    *
 528    * @param charArray input word
 529    * @param itemIndex item index for table charArray[0]
 530    * @return true if the entry exists
 531    */
 532   public boolean isEqual(char[] charArray, int itemIndex) {
 533     short hashIndex = getWordItemTableIndex(charArray[0]);
 534     return Utility.compareArray(charArray, 1,
 535         wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0;
 536   }
 537
 538 }