pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / spellchecker / src / java / org / apache / lucene / search / spell / LevensteinDistance.java
diff --git a/lucene-java-3.5.0/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java b/lucene-java-3.5.0/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java

new file mode 100755 (executable)

index 0000000..c599925
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java
@@ -0,0 +1,109 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Levenstein edit distance class.
+ */
+public final class LevensteinDistance implements StringDistance {
+
+    /**
+     * Optimized to run a bit faster than the static getDistance().
+     * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
+     */
+    public LevensteinDistance () {
+    }
+
+
+    //*****************************
+    // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
+    //*****************************
+    public float getDistance (String target, String other) {
+      char[] sa;
+      int n;
+      int p[]; //'previous' cost array, horizontally
+      int d[]; // cost array, horizontally
+      int _d[]; //placeholder to assist in swapping p and d
+      
+        /*
+           The difference between this impl. and the previous is that, rather
+           than creating and retaining a matrix of size s.length()+1 by t.length()+1,
+           we maintain two single-dimensional arrays of length s.length()+1.  The first, d,
+           is the 'current working' distance array that maintains the newest distance cost
+           counts as we iterate through the characters of String s.  Each time we increment
+           the index of String t we are comparing, d is copied to p, the second int[].  Doing so
+           allows us to retain the previous cost counts as required by the algorithm (taking
+           the minimum of the cost count to the left, up one, and diagonally up and to the left
+           of the current cost count being calculated).  (Note that the arrays aren't really
+           copied anymore, just switched...this is clearly much better than cloning an array
+           or doing a System.arraycopy() each time  through the outer loop.)
+
+           Effectively, the difference between the two implementations is this one does not
+           cause an out of memory condition when calculating the LD over two very large strings.
+         */
+
+        sa = target.toCharArray();
+        n = sa.length;
+        p = new int[n+1]; 
+        d = new int[n+1]; 
+      
+        final int m = other.length();
+        if (n == 0 || m == 0) {
+          if (n == m) {
+            return 1;
+          }
+          else {
+            return 0;
+          }
+        } 
+
+
+        // indexes into strings s and t
+        int i; // iterates through s
+        int j; // iterates through t
+
+        char t_j; // jth character of t
+
+        int cost; // cost
+
+        for (i = 0; i<=n; i++) {
+            p[i] = i;
+        }
+
+        for (j = 1; j<=m; j++) {
+            t_j = other.charAt(j-1);
+            d[0] = j;
+
+            for (i=1; i<=n; i++) {
+                cost = sa[i-1]==t_j ? 0 : 1;
+                // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+                d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1),  p[i-1]+cost);
+            }
+
+            // copy current distance counts to 'previous row' distance counts
+            _d = p;
+            p = d;
+            d = _d;
+        }
+
+        // our last action in the above loop was to switch d and p, so p now
+        // actually has the most recent cost counts
+        return 1.0f - ((float) p[n] / Math.max(other.length(), sa.length));
+    }
+
+}