lucene-java-3.4.0/lucene/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 package org.apache.lucene.analysis.cn.smart.hhmm;
  19
  20 import java.util.ArrayList;
  21 import java.util.Collection;
  22 import java.util.HashMap;
  23 import java.util.List;
  24 import java.util.Map;
  25
  26 import org.apache.lucene.analysis.cn.smart.Utility;
  27
  28 /**
  29  * Graph representing possible token pairs (bigrams) at each start offset in the sentence.
  30  * <p>
  31  * For each start offset, a list of possible token pairs is stored.
  32  * </p>
  33  * @lucene.experimental
  34  */
  35 class BiSegGraph {
  36
  37   private Map<Integer,ArrayList<SegTokenPair>> tokenPairListTable = new HashMap<Integer,ArrayList<SegTokenPair>>();
  38
  39   private List<SegToken> segTokenList;
  40
  41   private static BigramDictionary bigramDict = BigramDictionary.getInstance();
  42
  43   public BiSegGraph(SegGraph segGraph) {
  44     segTokenList = segGraph.makeIndex();
  45     generateBiSegGraph(segGraph);
  46   }
  47
  48   /*
  49    * Generate a BiSegGraph based upon a SegGraph
  50    */
  51   private void generateBiSegGraph(SegGraph segGraph) {
  52     double smooth = 0.1;
  53     int wordPairFreq = 0;
  54     int maxStart = segGraph.getMaxStart();
  55     double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE;
  56
  57     int next;
  58     char[] idBuffer;
  59     // get the list of tokens ordered and indexed
  60     segTokenList = segGraph.makeIndex();
  61     // Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1
  62     int key = -1;
  63     List<SegToken> nextTokens = null;
  64     while (key < maxStart) {
  65       if (segGraph.isStartExist(key)) {
  66
  67         List<SegToken> tokenList = segGraph.getStartList(key);
  68
  69         // Calculate all tokens for a given key.
  70         for (SegToken t1 : tokenList) {
  71           oneWordFreq = t1.weight;
  72           next = t1.endOffset;
  73           nextTokens = null;
  74           // Find the next corresponding Token.
  75           // For example: "Sunny seashore", the present Token is "sunny", next one should be "sea" or "seashore".
  76           // If we cannot find the next Token, then go to the end and repeat the same cycle.
  77           while (next <= maxStart) {
  78             // Because the beginning position of endToken is sentenceLen, so equal to sentenceLen can find endToken.
  79             if (segGraph.isStartExist(next)) {
  80               nextTokens = segGraph.getStartList(next);
  81               break;
  82             }
  83             next++;
  84           }
  85           if (nextTokens == null) {
  86             break;
  87           }
  88           for (SegToken t2 : nextTokens) {
  89             idBuffer = new char[t1.charArray.length + t2.charArray.length + 1];
  90             System.arraycopy(t1.charArray, 0, idBuffer, 0, t1.charArray.length);
  91             idBuffer[t1.charArray.length] = BigramDictionary.WORD_SEGMENT_CHAR;
  92             System.arraycopy(t2.charArray, 0, idBuffer,
  93                 t1.charArray.length + 1, t2.charArray.length);
  94
  95             // Two linked Words frequency
  96             wordPairFreq = bigramDict.getFrequency(idBuffer);
  97
  98             // Smoothing
  99
 100             // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
 101             weight = -Math
 102                 .log(smooth
 103                     * (1.0 + oneWordFreq)
 104                     / (Utility.MAX_FREQUENCE + 0.0)
 105                     + (1.0 - smooth)
 106                     * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));
 107
 108             SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.index,
 109                 t2.index, weight);
 110             this.addSegTokenPair(tokenPair);
 111           }
 112         }
 113       }
 114       key++;
 115     }
 116
 117   }
 118
 119   /**
 120    * Returns true if their is a list of token pairs at this offset (index of the second token)
 121    *
 122    * @param to index of the second token in the token pair
 123    * @return true if a token pair exists
 124    */
 125   public boolean isToExist(int to) {
 126     return tokenPairListTable.get(Integer.valueOf(to)) != null;
 127   }
 128
 129   /**
 130    * Return a {@link List} of all token pairs at this offset (index of the second token)
 131    *
 132    * @param to index of the second token in the token pair
 133    * @return {@link List} of token pairs.
 134    */
 135   public List<SegTokenPair> getToList(int to) {
 136     return tokenPairListTable.get(to);
 137   }
 138
 139   /**
 140    * Add a {@link SegTokenPair}
 141    *
 142    * @param tokenPair {@link SegTokenPair}
 143    */
 144   public void addSegTokenPair(SegTokenPair tokenPair) {
 145     int to = tokenPair.to;
 146     if (!isToExist(to)) {
 147       ArrayList<SegTokenPair> newlist = new ArrayList<SegTokenPair>();
 148       newlist.add(tokenPair);
 149       tokenPairListTable.put(to, newlist);
 150     } else {
 151       List<SegTokenPair> tokenPairList = tokenPairListTable.get(to);
 152       tokenPairList.add(tokenPair);
 153     }
 154   }
 155
 156   /**
 157    * Get the number of {@link SegTokenPair} entries in the table.
 158    * @return number of {@link SegTokenPair} entries
 159    */
 160   public int getToCount() {
 161     return tokenPairListTable.size();
 162   }
 163
 164   /**
 165    * Find the shortest path with the Viterbi algorithm.
 166    * @return {@link List}
 167    */
 168   public List<SegToken> getShortPath() {
 169     int current;
 170     int nodeCount = getToCount();
 171     List<PathNode> path = new ArrayList<PathNode>();
 172     PathNode zeroPath = new PathNode();
 173     zeroPath.weight = 0;
 174     zeroPath.preNode = 0;
 175     path.add(zeroPath);
 176     for (current = 1; current <= nodeCount; current++) {
 177       double weight;
 178       List<SegTokenPair> edges = getToList(current);
 179
 180       double minWeight = Double.MAX_VALUE;
 181       SegTokenPair minEdge = null;
 182       for (SegTokenPair edge : edges) {
 183         weight = edge.weight;
 184         PathNode preNode = path.get(edge.from);
 185         if (preNode.weight + weight < minWeight) {
 186           minWeight = preNode.weight + weight;
 187           minEdge = edge;
 188         }
 189       }
 190       PathNode newNode = new PathNode();
 191       newNode.weight = minWeight;
 192       newNode.preNode = minEdge.from;
 193       path.add(newNode);
 194     }
 195
 196     // Calculate PathNodes
 197     int preNode, lastNode;
 198     lastNode = path.size() - 1;
 199     current = lastNode;
 200     List<Integer> rpath = new ArrayList<Integer>();
 201     List<SegToken> resultPath = new ArrayList<SegToken>();
 202
 203     rpath.add(current);
 204     while (current != 0) {
 205       PathNode currentPathNode = path.get(current);
 206       preNode = currentPathNode.preNode;
 207       rpath.add(Integer.valueOf(preNode));
 208       current = preNode;
 209     }
 210     for (int j = rpath.size() - 1; j >= 0; j--) {
 211       Integer idInteger = rpath.get(j);
 212       int id = idInteger.intValue();
 213       SegToken t = segTokenList.get(id);
 214       resultPath.add(t);
 215     }
 216     return resultPath;
 217
 218   }
 219
 220   @Override
 221   public String toString() {
 222     StringBuilder sb = new StringBuilder();
 223     Collection<ArrayList<SegTokenPair>>  values = tokenPairListTable.values();
 224     for (ArrayList<SegTokenPair> segList : values) {
 225       for (SegTokenPair pair : segList) {
 226         sb.append(pair + "\n");
 227       }
 228     }
 229     return sb.toString();
 230   }
 231
 232 }