--- /dev/null
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart.hhmm;
+
+import java.util.List;
+
+import org.apache.lucene.analysis.cn.smart.CharType;
+import org.apache.lucene.analysis.cn.smart.Utility;
+import org.apache.lucene.analysis.cn.smart.WordType;
+import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;//javadoc @link
+
+/**
+ * Finds the optimal segmentation of a sentence into Chinese words
+ * @lucene.experimental
+ */
+public class HHMMSegmenter {
+
+ private static WordDictionary wordDict = WordDictionary.getInstance();
+
+ /**
+ * Create the {@link SegGraph} for a sentence.
+ *
+ * @param sentence input sentence, without start and end markers
+ * @return {@link SegGraph} corresponding to the input sentence.
+ */
+ @SuppressWarnings("fallthrough")
+ private SegGraph createSegGraph(String sentence) {
+ int i = 0, j;
+ int length = sentence.length();
+ int foundIndex;
+ int[] charTypeArray = getCharTypes(sentence);
+ StringBuilder wordBuf = new StringBuilder();
+ SegToken token;
+ int frequency = 0; // the number of times word appears.
+ boolean hasFullWidth;
+ int wordType;
+ char[] charArray;
+
+ SegGraph segGraph = new SegGraph();
+ while (i < length) {
+ hasFullWidth = false;
+ switch (charTypeArray[i]) {
+ case CharType.SPACE_LIKE:
+ i++;
+ break;
+ case CharType.HANZI:
+ j = i + 1;
+ wordBuf.delete(0, wordBuf.length());
+ // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not,
+ // it will store that single Chinese character (Hanzi) in the SegGraph. Otherwise, it will
+ // cause word division.
+ wordBuf.append(sentence.charAt(i));
+ charArray = new char[] { sentence.charAt(i) };
+ frequency = wordDict.getFrequency(charArray);
+ token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
+ frequency);
+ segGraph.addToken(token);
+
+ foundIndex = wordDict.getPrefixMatch(charArray);
+ while (j <= length && foundIndex != -1) {
+ if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) {
+ // It is the phrase we are looking for; In other words, we have found a phrase SegToken
+ // from i to j. It is not a monosyllabic word (single word).
+ frequency = wordDict.getFrequency(charArray);
+ token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
+ frequency);
+ segGraph.addToken(token);
+ }
+
+ while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
+ j++;
+
+ if (j < length && charTypeArray[j] == CharType.HANZI) {
+ wordBuf.append(sentence.charAt(j));
+ charArray = new char[wordBuf.length()];
+ wordBuf.getChars(0, charArray.length, charArray, 0);
+ // idArray has been found (foundWordIndex!=-1) as a prefix before.
+ // Therefore, idArray after it has been lengthened can only appear after foundWordIndex.
+ // So start searching after foundWordIndex.
+ foundIndex = wordDict.getPrefixMatch(charArray, foundIndex);
+ j++;
+ } else {
+ break;
+ }
+ }
+ i++;
+ break;
+ case CharType.FULLWIDTH_LETTER:
+ hasFullWidth = true; /* intentional fallthrough */
+ case CharType.LETTER:
+ j = i + 1;
+ while (j < length
+ && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) {
+ if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
+ hasFullWidth = true;
+ j++;
+ }
+ // Found a Token from i to j. Type is LETTER char string.
+ charArray = Utility.STRING_CHAR_ARRAY;
+ frequency = wordDict.getFrequency(charArray);
+ wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
+ token = new SegToken(charArray, i, j, wordType, frequency);
+ segGraph.addToken(token);
+ i = j;
+ break;
+ case CharType.FULLWIDTH_DIGIT:
+ hasFullWidth = true; /* intentional fallthrough */
+ case CharType.DIGIT:
+ j = i + 1;
+ while (j < length
+ && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) {
+ if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
+ hasFullWidth = true;
+ j++;
+ }
+ // Found a Token from i to j. Type is NUMBER char string.
+ charArray = Utility.NUMBER_CHAR_ARRAY;
+ frequency = wordDict.getFrequency(charArray);
+ wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
+ token = new SegToken(charArray, i, j, wordType, frequency);
+ segGraph.addToken(token);
+ i = j;
+ break;
+ case CharType.DELIMITER:
+ j = i + 1;
+ // No need to search the weight for the punctuation. Picking the highest frequency will work.
+ frequency = Utility.MAX_FREQUENCE;
+ charArray = new char[] { sentence.charAt(i) };
+ token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
+ segGraph.addToken(token);
+ i = j;
+ break;
+ default:
+ j = i + 1;
+ // Treat the unrecognized char symbol as unknown string.
+ // For example, any symbol not in GB2312 is treated as one of these.
+ charArray = Utility.STRING_CHAR_ARRAY;
+ frequency = wordDict.getFrequency(charArray);
+ token = new SegToken(charArray, i, j, WordType.STRING, frequency);
+ segGraph.addToken(token);
+ i = j;
+ break;
+ }
+ }
+
+ // Add two more Tokens: "beginning xx beginning"
+ charArray = Utility.START_CHAR_ARRAY;
+ frequency = wordDict.getFrequency(charArray);
+ token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
+ segGraph.addToken(token);
+
+ // "end xx end"
+ charArray = Utility.END_CHAR_ARRAY;
+ frequency = wordDict.getFrequency(charArray);
+ token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
+ frequency);
+ segGraph.addToken(token);
+
+ return segGraph;
+ }
+
+ /**
+ * Get the character types for every character in a sentence.
+ *
+ * @see Utility#getCharType(char)
+ * @param sentence input sentence
+ * @return array of character types corresponding to character positions in the sentence
+ */
+ private static int[] getCharTypes(String sentence) {
+ int length = sentence.length();
+ int[] charTypeArray = new int[length];
+ // the type of each character by position
+ for (int i = 0; i < length; i++) {
+ charTypeArray[i] = Utility.getCharType(sentence.charAt(i));
+ }
+
+ return charTypeArray;
+ }
+
+ /**
+ * Return a list of {@link SegToken} representing the best segmentation of a sentence
+ * @param sentence input sentence
+ * @return best segmentation as a {@link List}
+ */
+ public List<SegToken> process(String sentence) {
+ SegGraph segGraph = createSegGraph(sentence);
+ BiSegGraph biSegGraph = new BiSegGraph(segGraph);
+ List<SegToken> shortPath = biSegGraph.getShortPath();
+ return shortPath;
+ }
+}