1 package org.apache.lucene.search.spell;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 * N-Gram version of edit distance based on paper by Grzegorz Kondrak,
22 * "N-gram similarity and distance". Proceedings of the Twelfth International
23 * Conference on String Processing and Information Retrieval (SPIRE 2005), pp. 115-126,
24 * Buenos Aires, Argentina, November 2005.
25 * http://www.cs.ualberta.ca/~kondrak/papers/spire05.pdf
27 * This implementation uses the position-based optimization to compute partial
28 * matches of n-gram sub-strings and adds a null-character prefix of size n-1
29 * so that the first character is contained in the same number of n-grams as
30 * a middle character. Null-character prefix matches are discounted so that
31 * strings with no matching characters will return a score of 0 (the value returned is 1 minus the normalized distance).
34 public class NGramDistance implements StringDistance {
39 * Creates an N-Gram distance measure using n-grams of the specified size.
40 * @param size The size of the n-gram to be used to compute the string distance.
42 public NGramDistance(int size) {
47 * Creates an N-Gram distance measure using n-grams of size 2.
49 public NGramDistance() {
53 public float getDistance(String source, String target) {
54 final int sl = source.length();
55 final int tl = target.length();
57 if (sl == 0 || tl == 0) {
67 if (sl < n || tl < n) {
68 for (int i=0,ni=Math.min(sl,tl);i<ni;i++) {
69 if (source.charAt(i) == target.charAt(i)) {
73 return (float) cost/Math.max(sl, tl);
76 char[] sa = new char[sl+n-1];
77 float p[]; //'previous' cost array, horizontally
78 float d[]; // cost array, horizontally
79 float _d[]; //placeholder to assist in swapping p and d
81 //construct sa with prefix
82 for (int i=0;i<sa.length;i++) {
87 sa[i] = source.charAt(i-n+1);
93 // indexes into strings s and t
94 int i; // iterates through source
95 int j; // iterates through target
97 char[] t_j = new char[n]; // jth n-gram of t
99 for (i = 0; i<=sl; i++) {
103 for (j = 1; j<=tl; j++) {
104 //construct t_j n-gram
106 for (int ti=0;ti<n-j;ti++) {
107 t_j[ti]=0; //add prefix
109 for (int ti=n-j;ti<n;ti++) {
110 t_j[ti]=target.charAt(ti-(n-j));
114 t_j = target.substring(j-n, j).toCharArray();
117 for (i=1; i<=sl; i++) {
121 for (int ni=0;ni<n;ni++) {
122 if (sa[i-1+ni] != t_j[ni]) {
125 else if (sa[i-1+ni] == 0) { //discount matches on prefix
129 float ec = (float) cost/tn;
130 // minimum of: cell to the left + 1, cell above + 1, cell diagonally up-left + ec (the n-gram cost divided by tn)
131 d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+ec);
133 // copy current distance counts to 'previous row' distance counts
139 // our last action in the above loop was to switch d and p, so p now
140 // actually has the most recent cost counts
141 return 1.0f - (p[sl] / Math.max(tl, sl));