--- /dev/null
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart.hhmm;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.cn.smart.Utility;
+
+/**
+ * Graph representing possible token pairs (bigrams) at each start offset in the sentence.
+ * <p>
+ * For each start offset, a list of possible token pairs is stored.
+ * </p>
+ * @lucene.experimental
+ */
+class BiSegGraph {
+
+ private Map<Integer,ArrayList<SegTokenPair>> tokenPairListTable = new HashMap<Integer,ArrayList<SegTokenPair>>();
+
+ private List<SegToken> segTokenList;
+
+ private static BigramDictionary bigramDict = BigramDictionary.getInstance();
+
+ public BiSegGraph(SegGraph segGraph) {
+ segTokenList = segGraph.makeIndex();
+ generateBiSegGraph(segGraph);
+ }
+
+ /*
+ * Generate a BiSegGraph based upon a SegGraph
+ */
+ private void generateBiSegGraph(SegGraph segGraph) {
+ double smooth = 0.1;
+ int wordPairFreq = 0;
+ int maxStart = segGraph.getMaxStart();
+ double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE;
+
+ int next;
+ char[] idBuffer;
+ // get the list of tokens ordered and indexed
+ segTokenList = segGraph.makeIndex();
+ // Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1
+ int key = -1;
+ List<SegToken> nextTokens = null;
+ while (key < maxStart) {
+ if (segGraph.isStartExist(key)) {
+
+ List<SegToken> tokenList = segGraph.getStartList(key);
+
+ // Calculate all tokens for a given key.
+ for (SegToken t1 : tokenList) {
+ oneWordFreq = t1.weight;
+ next = t1.endOffset;
+ nextTokens = null;
+ // Find the next corresponding Token.
+ // For example: "Sunny seashore", the present Token is "sunny", next one should be "sea" or "seashore".
+ // If we cannot find the next Token, then go to the end and repeat the same cycle.
+ while (next <= maxStart) {
+ // Because the beginning position of endToken is sentenceLen, so equal to sentenceLen can find endToken.
+ if (segGraph.isStartExist(next)) {
+ nextTokens = segGraph.getStartList(next);
+ break;
+ }
+ next++;
+ }
+ if (nextTokens == null) {
+ break;
+ }
+ for (SegToken t2 : nextTokens) {
+ idBuffer = new char[t1.charArray.length + t2.charArray.length + 1];
+ System.arraycopy(t1.charArray, 0, idBuffer, 0, t1.charArray.length);
+ idBuffer[t1.charArray.length] = BigramDictionary.WORD_SEGMENT_CHAR;
+ System.arraycopy(t2.charArray, 0, idBuffer,
+ t1.charArray.length + 1, t2.charArray.length);
+
+ // Two linked Words frequency
+ wordPairFreq = bigramDict.getFrequency(idBuffer);
+
+ // Smoothing
+
+ // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
+ weight = -Math
+ .log(smooth
+ * (1.0 + oneWordFreq)
+ / (Utility.MAX_FREQUENCE + 0.0)
+ + (1.0 - smooth)
+ * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));
+
+ SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.index,
+ t2.index, weight);
+ this.addSegTokenPair(tokenPair);
+ }
+ }
+ }
+ key++;
+ }
+
+ }
+
+ /**
+ * Returns true if their is a list of token pairs at this offset (index of the second token)
+ *
+ * @param to index of the second token in the token pair
+ * @return true if a token pair exists
+ */
+ public boolean isToExist(int to) {
+ return tokenPairListTable.get(Integer.valueOf(to)) != null;
+ }
+
+ /**
+ * Return a {@link List} of all token pairs at this offset (index of the second token)
+ *
+ * @param to index of the second token in the token pair
+ * @return {@link List} of token pairs.
+ */
+ public List<SegTokenPair> getToList(int to) {
+ return tokenPairListTable.get(to);
+ }
+
+ /**
+ * Add a {@link SegTokenPair}
+ *
+ * @param tokenPair {@link SegTokenPair}
+ */
+ public void addSegTokenPair(SegTokenPair tokenPair) {
+ int to = tokenPair.to;
+ if (!isToExist(to)) {
+ ArrayList<SegTokenPair> newlist = new ArrayList<SegTokenPair>();
+ newlist.add(tokenPair);
+ tokenPairListTable.put(to, newlist);
+ } else {
+ List<SegTokenPair> tokenPairList = tokenPairListTable.get(to);
+ tokenPairList.add(tokenPair);
+ }
+ }
+
+ /**
+ * Get the number of {@link SegTokenPair} entries in the table.
+ * @return number of {@link SegTokenPair} entries
+ */
+ public int getToCount() {
+ return tokenPairListTable.size();
+ }
+
+ /**
+ * Find the shortest path with the Viterbi algorithm.
+ * @return {@link List}
+ */
+ public List<SegToken> getShortPath() {
+ int current;
+ int nodeCount = getToCount();
+ List<PathNode> path = new ArrayList<PathNode>();
+ PathNode zeroPath = new PathNode();
+ zeroPath.weight = 0;
+ zeroPath.preNode = 0;
+ path.add(zeroPath);
+ for (current = 1; current <= nodeCount; current++) {
+ double weight;
+ List<SegTokenPair> edges = getToList(current);
+
+ double minWeight = Double.MAX_VALUE;
+ SegTokenPair minEdge = null;
+ for (SegTokenPair edge : edges) {
+ weight = edge.weight;
+ PathNode preNode = path.get(edge.from);
+ if (preNode.weight + weight < minWeight) {
+ minWeight = preNode.weight + weight;
+ minEdge = edge;
+ }
+ }
+ PathNode newNode = new PathNode();
+ newNode.weight = minWeight;
+ newNode.preNode = minEdge.from;
+ path.add(newNode);
+ }
+
+ // Calculate PathNodes
+ int preNode, lastNode;
+ lastNode = path.size() - 1;
+ current = lastNode;
+ List<Integer> rpath = new ArrayList<Integer>();
+ List<SegToken> resultPath = new ArrayList<SegToken>();
+
+ rpath.add(current);
+ while (current != 0) {
+ PathNode currentPathNode = path.get(current);
+ preNode = currentPathNode.preNode;
+ rpath.add(Integer.valueOf(preNode));
+ current = preNode;
+ }
+ for (int j = rpath.size() - 1; j >= 0; j--) {
+ Integer idInteger = rpath.get(j);
+ int id = idInteger.intValue();
+ SegToken t = segTokenList.get(id);
+ resultPath.add(t);
+ }
+ return resultPath;
+
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ Collection<ArrayList<SegTokenPair>> values = tokenPairListTable.values();
+ for (ArrayList<SegTokenPair> segList : values) {
+ for (SegTokenPair pair : segList) {
+ sb.append(pair + "\n");
+ }
+ }
+ return sb.toString();
+ }
+
+}