/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
18 package org.apache.lucene.analysis.cn.smart.hhmm;
20 import java.util.List;
22 import org.apache.lucene.analysis.cn.smart.CharType;
23 import org.apache.lucene.analysis.cn.smart.Utility;
24 import org.apache.lucene.analysis.cn.smart.WordType;
25 import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;//javadoc @link
/**
 * Finds the optimal segmentation of a sentence into Chinese words.
 *
 * @lucene.experimental
 */
31 public class HHMMSegmenter {
33 private static WordDictionary wordDict = WordDictionary.getInstance();
36 * Create the {@link SegGraph} for a sentence.
38 * @param sentence input sentence, without start and end markers
39 * @return {@link SegGraph} corresponding to the input sentence.
41 @SuppressWarnings("fallthrough")
42 private SegGraph createSegGraph(String sentence) {
44 int length = sentence.length();
46 int[] charTypeArray = getCharTypes(sentence);
47 StringBuilder wordBuf = new StringBuilder();
49 int frequency = 0; // the number of times word appears.
54 SegGraph segGraph = new SegGraph();
57 switch (charTypeArray[i]) {
58 case CharType.SPACE_LIKE:
63 wordBuf.delete(0, wordBuf.length());
64 // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not,
65 // it will store that single Chinese character (Hanzi) in the SegGraph. Otherwise, it will
66 // cause word division.
67 wordBuf.append(sentence.charAt(i));
68 charArray = new char[] { sentence.charAt(i) };
69 frequency = wordDict.getFrequency(charArray);
70 token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
72 segGraph.addToken(token);
74 foundIndex = wordDict.getPrefixMatch(charArray);
75 while (j <= length && foundIndex != -1) {
76 if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) {
77 // It is the phrase we are looking for; In other words, we have found a phrase SegToken
78 // from i to j. It is not a monosyllabic word (single word).
79 frequency = wordDict.getFrequency(charArray);
80 token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
82 segGraph.addToken(token);
85 while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
88 if (j < length && charTypeArray[j] == CharType.HANZI) {
89 wordBuf.append(sentence.charAt(j));
90 charArray = new char[wordBuf.length()];
91 wordBuf.getChars(0, charArray.length, charArray, 0);
92 // idArray has been found (foundWordIndex!=-1) as a prefix before.
93 // Therefore, idArray after it has been lengthened can only appear after foundWordIndex.
94 // So start searching after foundWordIndex.
95 foundIndex = wordDict.getPrefixMatch(charArray, foundIndex);
103 case CharType.FULLWIDTH_LETTER:
104 hasFullWidth = true; /* intentional fallthrough */
105 case CharType.LETTER:
108 && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) {
109 if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
113 // Found a Token from i to j. Type is LETTER char string.
114 charArray = Utility.STRING_CHAR_ARRAY;
115 frequency = wordDict.getFrequency(charArray);
116 wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
117 token = new SegToken(charArray, i, j, wordType, frequency);
118 segGraph.addToken(token);
121 case CharType.FULLWIDTH_DIGIT:
122 hasFullWidth = true; /* intentional fallthrough */
126 && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) {
127 if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
131 // Found a Token from i to j. Type is NUMBER char string.
132 charArray = Utility.NUMBER_CHAR_ARRAY;
133 frequency = wordDict.getFrequency(charArray);
134 wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
135 token = new SegToken(charArray, i, j, wordType, frequency);
136 segGraph.addToken(token);
139 case CharType.DELIMITER:
141 // No need to search the weight for the punctuation. Picking the highest frequency will work.
142 frequency = Utility.MAX_FREQUENCE;
143 charArray = new char[] { sentence.charAt(i) };
144 token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
145 segGraph.addToken(token);
150 // Treat the unrecognized char symbol as unknown string.
151 // For example, any symbol not in GB2312 is treated as one of these.
152 charArray = Utility.STRING_CHAR_ARRAY;
153 frequency = wordDict.getFrequency(charArray);
154 token = new SegToken(charArray, i, j, WordType.STRING, frequency);
155 segGraph.addToken(token);
161 // Add two more Tokens: "beginning xx beginning"
162 charArray = Utility.START_CHAR_ARRAY;
163 frequency = wordDict.getFrequency(charArray);
164 token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
165 segGraph.addToken(token);
168 charArray = Utility.END_CHAR_ARRAY;
169 frequency = wordDict.getFrequency(charArray);
170 token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
172 segGraph.addToken(token);
178 * Get the character types for every character in a sentence.
180 * @see Utility#getCharType(char)
181 * @param sentence input sentence
182 * @return array of character types corresponding to character positions in the sentence
184 private static int[] getCharTypes(String sentence) {
185 int length = sentence.length();
186 int[] charTypeArray = new int[length];
187 // the type of each character by position
188 for (int i = 0; i < length; i++) {
189 charTypeArray[i] = Utility.getCharType(sentence.charAt(i));
192 return charTypeArray;
/**
 * Return a list of {@link SegToken} representing the best segmentation of a sentence.
 *
 * @param sentence input sentence
 * @return best segmentation as a {@link List}
 */
200 public List<SegToken> process(String sentence) {
201 SegGraph segGraph = createSegGraph(sentence);
202 BiSegGraph biSegGraph = new BiSegGraph(segGraph);
203 List<SegToken> shortPath = biSegGraph.getShortPath();