lucene-java-3.5.0/lucene/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 package org.apache.lucene.analysis.cn.smart.hhmm;
  19
  20 import java.util.List;
  21
  22 import org.apache.lucene.analysis.cn.smart.CharType;
  23 import org.apache.lucene.analysis.cn.smart.Utility;
  24 import org.apache.lucene.analysis.cn.smart.WordType;
  25 import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;//javadoc @link
  26
  27 /**
  28  * Finds the optimal segmentation of a sentence into Chinese words
  29  * @lucene.experimental
  30  */
  31 public class HHMMSegmenter {
  32
  33   private static WordDictionary wordDict = WordDictionary.getInstance();
  34
  35   /**
  36    * Create the {@link SegGraph} for a sentence.
  37    *
  38    * @param sentence input sentence, without start and end markers
  39    * @return {@link SegGraph} corresponding to the input sentence.
  40    */
  41   @SuppressWarnings("fallthrough")
  42   private SegGraph createSegGraph(String sentence) {
  43     int i = 0, j;
  44     int length = sentence.length();
  45     int foundIndex;
  46     int[] charTypeArray = getCharTypes(sentence);
  47     StringBuilder wordBuf = new StringBuilder();
  48     SegToken token;
  49     int frequency = 0; // the number of times word appears.
  50     boolean hasFullWidth;
  51     int wordType;
  52     char[] charArray;
  53
  54     SegGraph segGraph = new SegGraph();
  55     while (i < length) {
  56       hasFullWidth = false;
  57       switch (charTypeArray[i]) {
  58         case CharType.SPACE_LIKE:
  59           i++;
  60           break;
  61         case CharType.HANZI:
  62           j = i + 1;
  63           wordBuf.delete(0, wordBuf.length());
  64           // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not,
  65           // it will store that single Chinese character (Hanzi) in the SegGraph.  Otherwise, it will
  66           // cause word division.
  67           wordBuf.append(sentence.charAt(i));
  68           charArray = new char[] { sentence.charAt(i) };
  69           frequency = wordDict.getFrequency(charArray);
  70           token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
  71               frequency);
  72           segGraph.addToken(token);
  73
  74           foundIndex = wordDict.getPrefixMatch(charArray);
  75           while (j <= length && foundIndex != -1) {
  76             if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) {
  77               // It is the phrase we are looking for; In other words, we have found a phrase SegToken
  78               // from i to j.  It is not a monosyllabic word (single word).
  79               frequency = wordDict.getFrequency(charArray);
  80               token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
  81                   frequency);
  82               segGraph.addToken(token);
  83             }
  84
  85             while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
  86               j++;
  87
  88             if (j < length && charTypeArray[j] == CharType.HANZI) {
  89               wordBuf.append(sentence.charAt(j));
  90               charArray = new char[wordBuf.length()];
  91               wordBuf.getChars(0, charArray.length, charArray, 0);
  92               // idArray has been found (foundWordIndex!=-1) as a prefix before.
  93               // Therefore, idArray after it has been lengthened can only appear after foundWordIndex.
  94               // So start searching after foundWordIndex.
  95               foundIndex = wordDict.getPrefixMatch(charArray, foundIndex);
  96               j++;
  97             } else {
  98               break;
  99             }
 100           }
 101           i++;
 102           break;
 103         case CharType.FULLWIDTH_LETTER:
 104           hasFullWidth = true; /* intentional fallthrough */
 105         case CharType.LETTER:
 106           j = i + 1;
 107           while (j < length
 108               && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) {
 109             if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
 110               hasFullWidth = true;
 111             j++;
 112           }
 113           // Found a Token from i to j. Type is LETTER char string.
 114           charArray = Utility.STRING_CHAR_ARRAY;
 115           frequency = wordDict.getFrequency(charArray);
 116           wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
 117           token = new SegToken(charArray, i, j, wordType, frequency);
 118           segGraph.addToken(token);
 119           i = j;
 120           break;
 121         case CharType.FULLWIDTH_DIGIT:
 122           hasFullWidth = true; /* intentional fallthrough */
 123         case CharType.DIGIT:
 124           j = i + 1;
 125           while (j < length
 126               && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) {
 127             if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
 128               hasFullWidth = true;
 129             j++;
 130           }
 131           // Found a Token from i to j. Type is NUMBER char string.
 132           charArray = Utility.NUMBER_CHAR_ARRAY;
 133           frequency = wordDict.getFrequency(charArray);
 134           wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
 135           token = new SegToken(charArray, i, j, wordType, frequency);
 136           segGraph.addToken(token);
 137           i = j;
 138           break;
 139         case CharType.DELIMITER:
 140           j = i + 1;
 141           // No need to search the weight for the punctuation.  Picking the highest frequency will work.
 142           frequency = Utility.MAX_FREQUENCE;
 143           charArray = new char[] { sentence.charAt(i) };
 144           token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
 145           segGraph.addToken(token);
 146           i = j;
 147           break;
 148         default:
 149           j = i + 1;
 150           // Treat the unrecognized char symbol as unknown string.
 151           // For example, any symbol not in GB2312 is treated as one of these.
 152           charArray = Utility.STRING_CHAR_ARRAY;
 153           frequency = wordDict.getFrequency(charArray);
 154           token = new SegToken(charArray, i, j, WordType.STRING, frequency);
 155           segGraph.addToken(token);
 156           i = j;
 157           break;
 158       }
 159     }
 160
 161     // Add two more Tokens: "beginning xx beginning"
 162     charArray = Utility.START_CHAR_ARRAY;
 163     frequency = wordDict.getFrequency(charArray);
 164     token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
 165     segGraph.addToken(token);
 166
 167     // "end xx end"
 168     charArray = Utility.END_CHAR_ARRAY;
 169     frequency = wordDict.getFrequency(charArray);
 170     token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
 171         frequency);
 172     segGraph.addToken(token);
 173
 174     return segGraph;
 175   }
 176
 177   /**
 178    * Get the character types for every character in a sentence.
 179    *
 180    * @see Utility#getCharType(char)
 181    * @param sentence input sentence
 182    * @return array of character types corresponding to character positions in the sentence
 183    */
 184   private static int[] getCharTypes(String sentence) {
 185     int length = sentence.length();
 186     int[] charTypeArray = new int[length];
 187     // the type of each character by position
 188     for (int i = 0; i < length; i++) {
 189       charTypeArray[i] = Utility.getCharType(sentence.charAt(i));
 190     }
 191
 192     return charTypeArray;
 193   }
 194
 195   /**
 196    * Return a list of {@link SegToken} representing the best segmentation of a sentence
 197    * @param sentence input sentence
 198    * @return best segmentation as a {@link List}
 199    */
 200   public List<SegToken> process(String sentence) {
 201     SegGraph segGraph = createSegGraph(sentence);
 202     BiSegGraph biSegGraph = new BiSegGraph(segGraph);
 203     List<SegToken> shortPath = biSegGraph.getShortPath();
 204     return shortPath;
 205   }
 206 }