X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java diff --git a/lucene-java-3.4.0/lucene/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java b/lucene-java-3.4.0/lucene/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java deleted file mode 100644 index a40dcb5..0000000 --- a/lucene-java-3.4.0/lucene/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java +++ /dev/null @@ -1,206 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.analysis.cn.smart.hhmm; - -import java.util.List; - -import org.apache.lucene.analysis.cn.smart.CharType; -import org.apache.lucene.analysis.cn.smart.Utility; -import org.apache.lucene.analysis.cn.smart.WordType; -import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;//javadoc @link - -/** - * Finds the optimal segmentation of a sentence into Chinese words - * @lucene.experimental - */ -public class HHMMSegmenter { - - private static WordDictionary wordDict = WordDictionary.getInstance(); - - /** - * Create the {@link SegGraph} for a sentence. - * - * @param sentence input sentence, without start and end markers - * @return {@link SegGraph} corresponding to the input sentence. - */ - @SuppressWarnings("fallthrough") - private SegGraph createSegGraph(String sentence) { - int i = 0, j; - int length = sentence.length(); - int foundIndex; - int[] charTypeArray = getCharTypes(sentence); - StringBuilder wordBuf = new StringBuilder(); - SegToken token; - int frequency = 0; // the number of times word appears. - boolean hasFullWidth; - int wordType; - char[] charArray; - - SegGraph segGraph = new SegGraph(); - while (i < length) { - hasFullWidth = false; - switch (charTypeArray[i]) { - case CharType.SPACE_LIKE: - i++; - break; - case CharType.HANZI: - j = i + 1; - wordBuf.delete(0, wordBuf.length()); - // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not, - // it will store that single Chinese character (Hanzi) in the SegGraph. Otherwise, it will - // cause word division. - wordBuf.append(sentence.charAt(i)); - charArray = new char[] { sentence.charAt(i) }; - frequency = wordDict.getFrequency(charArray); - token = new SegToken(charArray, i, j, WordType.CHINESE_WORD, - frequency); - segGraph.addToken(token); - - foundIndex = wordDict.getPrefixMatch(charArray); - while (j <= length && foundIndex != -1) { - if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) { - // It is the phrase we are looking for; In other words, we have found a phrase SegToken - // from i to j. It is not a monosyllabic word (single word). - frequency = wordDict.getFrequency(charArray); - token = new SegToken(charArray, i, j, WordType.CHINESE_WORD, - frequency); - segGraph.addToken(token); - } - - while (j < length && charTypeArray[j] == CharType.SPACE_LIKE) - j++; - - if (j < length && charTypeArray[j] == CharType.HANZI) { - wordBuf.append(sentence.charAt(j)); - charArray = new char[wordBuf.length()]; - wordBuf.getChars(0, charArray.length, charArray, 0); - // idArray has been found (foundWordIndex!=-1) as a prefix before. - // Therefore, idArray after it has been lengthened can only appear after foundWordIndex. - // So start searching after foundWordIndex. - foundIndex = wordDict.getPrefixMatch(charArray, foundIndex); - j++; - } else { - break; - } - } - i++; - break; - case CharType.FULLWIDTH_LETTER: - hasFullWidth = true; /* intentional fallthrough */ - case CharType.LETTER: - j = i + 1; - while (j < length - && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) { - if (charTypeArray[j] == CharType.FULLWIDTH_LETTER) - hasFullWidth = true; - j++; - } - // Found a Token from i to j. Type is LETTER char string. - charArray = Utility.STRING_CHAR_ARRAY; - frequency = wordDict.getFrequency(charArray); - wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING; - token = new SegToken(charArray, i, j, wordType, frequency); - segGraph.addToken(token); - i = j; - break; - case CharType.FULLWIDTH_DIGIT: - hasFullWidth = true; /* intentional fallthrough */ - case CharType.DIGIT: - j = i + 1; - while (j < length - && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) { - if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT) - hasFullWidth = true; - j++; - } - // Found a Token from i to j. Type is NUMBER char string. - charArray = Utility.NUMBER_CHAR_ARRAY; - frequency = wordDict.getFrequency(charArray); - wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER; - token = new SegToken(charArray, i, j, wordType, frequency); - segGraph.addToken(token); - i = j; - break; - case CharType.DELIMITER: - j = i + 1; - // No need to search the weight for the punctuation. Picking the highest frequency will work. - frequency = Utility.MAX_FREQUENCE; - charArray = new char[] { sentence.charAt(i) }; - token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency); - segGraph.addToken(token); - i = j; - break; - default: - j = i + 1; - // Treat the unrecognized char symbol as unknown string. - // For example, any symbol not in GB2312 is treated as one of these. - charArray = Utility.STRING_CHAR_ARRAY; - frequency = wordDict.getFrequency(charArray); - token = new SegToken(charArray, i, j, WordType.STRING, frequency); - segGraph.addToken(token); - i = j; - break; - } - } - - // Add two more Tokens: "beginning xx beginning" - charArray = Utility.START_CHAR_ARRAY; - frequency = wordDict.getFrequency(charArray); - token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency); - segGraph.addToken(token); - - // "end xx end" - charArray = Utility.END_CHAR_ARRAY; - frequency = wordDict.getFrequency(charArray); - token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END, - frequency); - segGraph.addToken(token); - - return segGraph; - } - - /** - * Get the character types for every character in a sentence. - * - * @see Utility#getCharType(char) - * @param sentence input sentence - * @return array of character types corresponding to character positions in the sentence - */ - private static int[] getCharTypes(String sentence) { - int length = sentence.length(); - int[] charTypeArray = new int[length]; - // the type of each character by position - for (int i = 0; i < length; i++) { - charTypeArray[i] = Utility.getCharType(sentence.charAt(i)); - } - - return charTypeArray; - } - - /** - * Return a list of {@link SegToken} representing the best segmentation of a sentence - * @param sentence input sentence - * @return best segmentation as a {@link List} - */ - public List process(String sentence) { - SegGraph segGraph = createSegGraph(sentence); - BiSegGraph biSegGraph = new BiSegGraph(segGraph); - List shortPath = biSegGraph.getShortPath(); - return shortPath; - } -}