2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org.apache.lucene.analysis.cn.smart.hhmm;
20 import java.util.ArrayList;
21 import java.util.Collection;
22 import java.util.HashMap;
23 import java.util.List;
26 import org.apache.lucene.analysis.cn.smart.Utility;
29 * Graph representing possible token pairs (bigrams) at each start offset in the sentence.
31 * For each start offset, a list of possible token pairs is stored.
33 * @lucene.experimental
37 private Map<Integer,ArrayList<SegTokenPair>> tokenPairListTable = new HashMap<Integer,ArrayList<SegTokenPair>>();
39 private List<SegToken> segTokenList;
41 private static BigramDictionary bigramDict = BigramDictionary.getInstance();
43 public BiSegGraph(SegGraph segGraph) {
44 segTokenList = segGraph.makeIndex();
45 generateBiSegGraph(segGraph);
49 * Generate a BiSegGraph based upon a SegGraph
51 private void generateBiSegGraph(SegGraph segGraph) {
54 int maxStart = segGraph.getMaxStart();
55 double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE;
59 // get the list of tokens ordered and indexed
60 segTokenList = segGraph.makeIndex();
61 // Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1
63 List<SegToken> nextTokens = null;
64 while (key < maxStart) {
65 if (segGraph.isStartExist(key)) {
67 List<SegToken> tokenList = segGraph.getStartList(key);
69 // Calculate all tokens for a given key.
70 for (SegToken t1 : tokenList) {
71 oneWordFreq = t1.weight;
74 // Find the next corresponding Token.
75 // For example: "Sunny seashore", the present Token is "sunny", next one should be "sea" or "seashore".
76 // If we cannot find the next Token, then go to the end and repeat the same cycle.
77 while (next <= maxStart) {
78 // Because the beginning position of endToken is sentenceLen, so equal to sentenceLen can find endToken.
79 if (segGraph.isStartExist(next)) {
80 nextTokens = segGraph.getStartList(next);
85 if (nextTokens == null) {
88 for (SegToken t2 : nextTokens) {
89 idBuffer = new char[t1.charArray.length + t2.charArray.length + 1];
90 System.arraycopy(t1.charArray, 0, idBuffer, 0, t1.charArray.length);
91 idBuffer[t1.charArray.length] = BigramDictionary.WORD_SEGMENT_CHAR;
92 System.arraycopy(t2.charArray, 0, idBuffer,
93 t1.charArray.length + 1, t2.charArray.length);
95 // Two linked Words frequency
96 wordPairFreq = bigramDict.getFrequency(idBuffer);
100 // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
103 * (1.0 + oneWordFreq)
104 / (Utility.MAX_FREQUENCE + 0.0)
106 * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));
108 SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.index,
110 this.addSegTokenPair(tokenPair);
120 * Returns true if their is a list of token pairs at this offset (index of the second token)
122 * @param to index of the second token in the token pair
123 * @return true if a token pair exists
125 public boolean isToExist(int to) {
126 return tokenPairListTable.get(Integer.valueOf(to)) != null;
130 * Return a {@link List} of all token pairs at this offset (index of the second token)
132 * @param to index of the second token in the token pair
133 * @return {@link List} of token pairs.
135 public List<SegTokenPair> getToList(int to) {
136 return tokenPairListTable.get(to);
140 * Add a {@link SegTokenPair}
142 * @param tokenPair {@link SegTokenPair}
144 public void addSegTokenPair(SegTokenPair tokenPair) {
145 int to = tokenPair.to;
146 if (!isToExist(to)) {
147 ArrayList<SegTokenPair> newlist = new ArrayList<SegTokenPair>();
148 newlist.add(tokenPair);
149 tokenPairListTable.put(to, newlist);
151 List<SegTokenPair> tokenPairList = tokenPairListTable.get(to);
152 tokenPairList.add(tokenPair);
157 * Get the number of {@link SegTokenPair} entries in the table.
158 * @return number of {@link SegTokenPair} entries
160 public int getToCount() {
161 return tokenPairListTable.size();
165 * Find the shortest path with the Viterbi algorithm.
166 * @return {@link List}
168 public List<SegToken> getShortPath() {
170 int nodeCount = getToCount();
171 List<PathNode> path = new ArrayList<PathNode>();
172 PathNode zeroPath = new PathNode();
174 zeroPath.preNode = 0;
176 for (current = 1; current <= nodeCount; current++) {
178 List<SegTokenPair> edges = getToList(current);
180 double minWeight = Double.MAX_VALUE;
181 SegTokenPair minEdge = null;
182 for (SegTokenPair edge : edges) {
183 weight = edge.weight;
184 PathNode preNode = path.get(edge.from);
185 if (preNode.weight + weight < minWeight) {
186 minWeight = preNode.weight + weight;
190 PathNode newNode = new PathNode();
191 newNode.weight = minWeight;
192 newNode.preNode = minEdge.from;
196 // Calculate PathNodes
197 int preNode, lastNode;
198 lastNode = path.size() - 1;
200 List<Integer> rpath = new ArrayList<Integer>();
201 List<SegToken> resultPath = new ArrayList<SegToken>();
204 while (current != 0) {
205 PathNode currentPathNode = path.get(current);
206 preNode = currentPathNode.preNode;
207 rpath.add(Integer.valueOf(preNode));
210 for (int j = rpath.size() - 1; j >= 0; j--) {
211 Integer idInteger = rpath.get(j);
212 int id = idInteger.intValue();
213 SegToken t = segTokenList.get(id);
221 public String toString() {
222 StringBuilder sb = new StringBuilder();
223 Collection<ArrayList<SegTokenPair>> values = tokenPairListTable.values();
224 for (ArrayList<SegTokenPair> segList : values) {
225 for (SegTokenPair pair : segList) {
226 sb.append(pair + "\n");
229 return sb.toString();