+++ /dev/null
-package org.apache.lucene.search.spell;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Levenstein edit distance class.
- */
-public final class LevensteinDistance implements StringDistance {
-
- /**
- * Optimized to run a bit faster than the static getDistance().
- * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
- */
- public LevensteinDistance () {
- }
-
-
- //*****************************
- // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
- //*****************************
- public float getDistance (String target, String other) {
- char[] sa;
- int n;
- int p[]; //'previous' cost array, horizontally
- int d[]; // cost array, horizontally
- int _d[]; //placeholder to assist in swapping p and d
-
- /*
- The difference between this impl. and the previous is that, rather
- than creating and retaining a matrix of size s.length()+1 by t.length()+1,
- we maintain two single-dimensional arrays of length s.length()+1. The first, d,
- is the 'current working' distance array that maintains the newest distance cost
- counts as we iterate through the characters of String s. Each time we increment
- the index of String t we are comparing, d is copied to p, the second int[]. Doing so
- allows us to retain the previous cost counts as required by the algorithm (taking
- the minimum of the cost count to the left, up one, and diagonally up and to the left
- of the current cost count being calculated). (Note that the arrays aren't really
- copied anymore, just switched...this is clearly much better than cloning an array
- or doing a System.arraycopy() each time through the outer loop.)
-
- Effectively, the difference between the two implementations is this one does not
- cause an out of memory condition when calculating the LD over two very large strings.
- */
-
- sa = target.toCharArray();
- n = sa.length;
- p = new int[n+1];
- d = new int[n+1];
-
- final int m = other.length();
- if (n == 0 || m == 0) {
- if (n == m) {
- return 1;
- }
- else {
- return 0;
- }
- }
-
-
- // indexes into strings s and t
- int i; // iterates through s
- int j; // iterates through t
-
- char t_j; // jth character of t
-
- int cost; // cost
-
- for (i = 0; i<=n; i++) {
- p[i] = i;
- }
-
- for (j = 1; j<=m; j++) {
- t_j = other.charAt(j-1);
- d[0] = j;
-
- for (i=1; i<=n; i++) {
- cost = sa[i-1]==t_j ? 0 : 1;
- // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
- d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost);
- }
-
- // copy current distance counts to 'previous row' distance counts
- _d = p;
- p = d;
- d = _d;
- }
-
- // our last action in the above loop was to switch d and p, so p now
- // actually has the most recent cost counts
- return 1.0f - ((float) p[n] / Math.max(other.length(), sa.length));
- }
-
-}