1 package org.apache.lucene.search;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.util.List;
21 import java.util.Arrays;
22 import java.io.IOException;
24 import org.apache.lucene.analysis.MockAnalyzer;
25 import org.apache.lucene.analysis.standard.StandardAnalyzer;
26 import org.apache.lucene.util.LuceneTestCase;
27 import org.apache.lucene.document.Document;
28 import org.apache.lucene.document.Field;
29 import org.apache.lucene.index.IndexReader;
30 import org.apache.lucene.index.MultiReader;
31 import org.apache.lucene.index.RandomIndexWriter;
32 import org.apache.lucene.index.Term;
33 import org.apache.lucene.store.Directory;
34 import org.apache.lucene.queryParser.QueryParser;
37 * Tests {@link FuzzyQuery}.
40 public class TestFuzzyQuery extends LuceneTestCase {
42 public void testFuzziness() throws Exception {
43 Directory directory = newDirectory();
44 RandomIndexWriter writer = new RandomIndexWriter(random, directory);
45 addDoc("aaaaa", writer);
46 addDoc("aaaab", writer);
47 addDoc("aaabb", writer);
48 addDoc("aabbb", writer);
49 addDoc("abbbb", writer);
50 addDoc("bbbbb", writer);
51 addDoc("ddddd", writer);
53 IndexReader reader = writer.getReader();
54 IndexSearcher searcher = newSearcher(reader);
57 FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0);
58 ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
59 assertEquals(3, hits.length);
62 query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 1);
63 hits = searcher.search(query, null, 1000).scoreDocs;
64 assertEquals(3, hits.length);
65 query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 2);
66 hits = searcher.search(query, null, 1000).scoreDocs;
67 assertEquals(3, hits.length);
68 query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 3);
69 hits = searcher.search(query, null, 1000).scoreDocs;
70 assertEquals(3, hits.length);
71 query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 4);
72 hits = searcher.search(query, null, 1000).scoreDocs;
73 assertEquals(2, hits.length);
74 query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 5);
75 hits = searcher.search(query, null, 1000).scoreDocs;
76 assertEquals(1, hits.length);
77 query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 6);
78 hits = searcher.search(query, null, 1000).scoreDocs;
79 assertEquals(1, hits.length);
82 query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0);
83 hits = searcher.search(query, null, 1000).scoreDocs;
84 assertEquals("3 documents should match", 3, hits.length);
85 List<String> order = Arrays.asList("bbbbb","abbbb","aabbb");
86 for (int i = 0; i < hits.length; i++) {
87 final String term = searcher.doc(hits[i].doc).get("field");
88 //System.out.println(hits[i].score);
89 assertEquals(order.get(i), term);
92 // test pq size by supplying maxExpansions=2
93 // This query would normally return 3 documents, because 3 terms match (see above):
94 query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0, 2);
95 hits = searcher.search(query, null, 1000).scoreDocs;
96 assertEquals("only 2 documents should match", 2, hits.length);
97 order = Arrays.asList("bbbbb","abbbb");
98 for (int i = 0; i < hits.length; i++) {
99 final String term = searcher.doc(hits[i].doc).get("field");
100 //System.out.println(hits[i].score);
101 assertEquals(order.get(i), term);
104 // not similar enough:
105 query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
106 hits = searcher.search(query, null, 1000).scoreDocs;
107 assertEquals(0, hits.length);
108 query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMinSimilarity, 0); // edit distance to "aaaaa" = 3
109 hits = searcher.search(query, null, 1000).scoreDocs;
110 assertEquals(0, hits.length);
112 // query identical to a word in the index:
113 query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0);
114 hits = searcher.search(query, null, 1000).scoreDocs;
115 assertEquals(3, hits.length);
116 assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
117 // default allows for up to two edits:
118 assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
119 assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
121 // query similar to a word in the index:
122 query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 0);
123 hits = searcher.search(query, null, 1000).scoreDocs;
124 assertEquals(3, hits.length);
125 assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
126 assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
127 assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
130 query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 1);
131 hits = searcher.search(query, null, 1000).scoreDocs;
132 assertEquals(3, hits.length);
133 assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
134 assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
135 assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
136 query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 2);
137 hits = searcher.search(query, null, 1000).scoreDocs;
138 assertEquals(3, hits.length);
139 assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
140 assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
141 assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
142 query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 3);
143 hits = searcher.search(query, null, 1000).scoreDocs;
144 assertEquals(3, hits.length);
145 assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
146 assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
147 assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
148 query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 4);
149 hits = searcher.search(query, null, 1000).scoreDocs;
150 assertEquals(2, hits.length);
151 assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
152 assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
153 query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 5);
154 hits = searcher.search(query, null, 1000).scoreDocs;
155 assertEquals(0, hits.length);
158 query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0);
159 hits = searcher.search(query, null, 1000).scoreDocs;
160 assertEquals(1, hits.length);
161 assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
164 query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 1);
165 hits = searcher.search(query, null, 1000).scoreDocs;
166 assertEquals(1, hits.length);
167 assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
168 query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 2);
169 hits = searcher.search(query, null, 1000).scoreDocs;
170 assertEquals(1, hits.length);
171 assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
172 query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 3);
173 hits = searcher.search(query, null, 1000).scoreDocs;
174 assertEquals(1, hits.length);
175 assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
176 query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 4);
177 hits = searcher.search(query, null, 1000).scoreDocs;
178 assertEquals(1, hits.length);
179 assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
180 query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 5);
181 hits = searcher.search(query, null, 1000).scoreDocs;
182 assertEquals(0, hits.length);
185 // different field = no match:
186 query = new FuzzyQuery(new Term("anotherfield", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0);
187 hits = searcher.search(query, null, 1000).scoreDocs;
188 assertEquals(0, hits.length);
195 public void testFuzzinessLong() throws Exception {
196 Directory directory = newDirectory();
197 RandomIndexWriter writer = new RandomIndexWriter(random, directory);
198 addDoc("aaaaaaa", writer);
199 addDoc("segment", writer);
201 IndexReader reader = writer.getReader();
202 IndexSearcher searcher = newSearcher(reader);
206 // not similar enough:
207 query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
208 ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
209 assertEquals(0, hits.length);
210 // edit distance to "aaaaaaa" = 3, this matches because the string is longer than
211 // in testDefaultFuzziness so a bigger difference is allowed:
212 query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 0);
213 hits = searcher.search(query, null, 1000).scoreDocs;
214 assertEquals(1, hits.length);
215 assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
218 query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 1);
219 hits = searcher.search(query, null, 1000).scoreDocs;
220 assertEquals(1, hits.length);
221 assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
222 query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 4);
223 hits = searcher.search(query, null, 1000).scoreDocs;
224 assertEquals(1, hits.length);
225 assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
226 query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 5);
227 hits = searcher.search(query, null, 1000).scoreDocs;
228 assertEquals(0, hits.length);
230 // no match, more than half of the characters is wrong:
231 query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 0);
232 hits = searcher.search(query, null, 1000).scoreDocs;
233 assertEquals(0, hits.length);
236 query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 2);
237 hits = searcher.search(query, null, 1000).scoreDocs;
238 assertEquals(0, hits.length);
240 // "student" and "stellent" are indeed similar to "segment" by default:
241 query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 0);
242 hits = searcher.search(query, null, 1000).scoreDocs;
243 assertEquals(1, hits.length);
244 query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 0);
245 hits = searcher.search(query, null, 1000).scoreDocs;
246 assertEquals(1, hits.length);
249 query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 1);
250 hits = searcher.search(query, null, 1000).scoreDocs;
251 assertEquals(1, hits.length);
252 query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 1);
253 hits = searcher.search(query, null, 1000).scoreDocs;
254 assertEquals(1, hits.length);
255 query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 2);
256 hits = searcher.search(query, null, 1000).scoreDocs;
257 assertEquals(0, hits.length);
258 query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 2);
259 hits = searcher.search(query, null, 1000).scoreDocs;
260 assertEquals(0, hits.length);
262 // "student" doesn't match anymore thanks to increased minimum similarity:
263 query = new FuzzyQuery(new Term("field", "student"), 0.6f, 0);
264 hits = searcher.search(query, null, 1000).scoreDocs;
265 assertEquals(0, hits.length);
268 query = new FuzzyQuery(new Term("field", "student"), 1.1f);
269 fail("Expected IllegalArgumentException");
270 } catch (IllegalArgumentException e) {
271 // expecting exception
274 query = new FuzzyQuery(new Term("field", "student"), -0.1f);
275 fail("Expected IllegalArgumentException");
276 } catch (IllegalArgumentException e) {
277 // expecting exception
285 public void testTokenLengthOpt() throws IOException {
286 Directory directory = newDirectory();
287 RandomIndexWriter writer = new RandomIndexWriter(random, directory);
288 addDoc("12345678911", writer);
289 addDoc("segment", writer);
291 IndexReader reader = writer.getReader();
292 IndexSearcher searcher = newSearcher(reader);
296 // term not over 10 chars, so optimization shortcuts
297 query = new FuzzyQuery(new Term("field", "1234569"), 0.9f);
298 ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
299 assertEquals(0, hits.length);
301 // 10 chars, so no optimization
302 query = new FuzzyQuery(new Term("field", "1234567891"), 0.9f);
303 hits = searcher.search(query, null, 1000).scoreDocs;
304 assertEquals(0, hits.length);
306 // over 10 chars, so no optimization
307 query = new FuzzyQuery(new Term("field", "12345678911"), 0.9f);
308 hits = searcher.search(query, null, 1000).scoreDocs;
309 assertEquals(1, hits.length);
311 // over 10 chars, no match
312 query = new FuzzyQuery(new Term("field", "sdfsdfsdfsdf"), 0.9f);
313 hits = searcher.search(query, null, 1000).scoreDocs;
314 assertEquals(0, hits.length);
321 /** Test the TopTermsBoostOnlyBooleanQueryRewrite rewrite method. */
322 public void testBoostOnlyRewrite() throws Exception {
323 Directory directory = newDirectory();
324 RandomIndexWriter writer = new RandomIndexWriter(random, directory);
325 addDoc("Lucene", writer);
326 addDoc("Lucene", writer);
327 addDoc("Lucenne", writer);
329 IndexReader reader = writer.getReader();
330 IndexSearcher searcher = newSearcher(reader);
333 FuzzyQuery query = new FuzzyQuery(new Term("field", "Lucene"));
334 query.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(50));
335 ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
336 assertEquals(3, hits.length);
337 // normally, 'Lucenne' would be the first result as IDF will skew the score.
338 assertEquals("Lucene", reader.document(hits[0].doc).get("field"));
339 assertEquals("Lucene", reader.document(hits[1].doc).get("field"));
340 assertEquals("Lucenne", reader.document(hits[2].doc).get("field"));
346 public void testGiga() throws Exception {
348 MockAnalyzer analyzer = new MockAnalyzer(random);
349 Directory index = newDirectory();
350 RandomIndexWriter w = new RandomIndexWriter(random, index);
352 addDoc("Lucene in Action", w);
353 addDoc("Lucene for Dummies", w);
356 addDoc("Giga byte", w);
358 addDoc("ManagingGigabytesManagingGigabyte", w);
359 addDoc("ManagingGigabytesManagingGigabytes", w);
361 addDoc("The Art of Computer Science", w);
362 addDoc("J. K. Rowling", w);
363 addDoc("JK Rowling", w);
364 addDoc("Joanne K Roling", w);
365 addDoc("Bruce Willis", w);
366 addDoc("Willis bruce", w);
367 addDoc("Brute willis", w);
368 addDoc("B. willis", w);
369 IndexReader r = w.getReader();
372 Query q = new QueryParser(TEST_VERSION_CURRENT, "field", analyzer).parse( "giga~0.9" );
375 IndexSearcher searcher = newSearcher(r);
376 ScoreDoc[] hits = searcher.search(q, 10).scoreDocs;
377 assertEquals(1, hits.length);
378 assertEquals("Giga byte", searcher.doc(hits[0].doc).get("field"));
384 private void addDoc(String text, RandomIndexWriter writer) throws IOException {
385 Document doc = new Document();
386 doc.add(newField("field", text, Field.Store.YES, Field.Index.ANALYZED));
387 writer.addDocument(doc);