pylucene 3.5.0-3
[pylucene.git] / lucene-java-3.5.0 / lucene / backwards / src / test / org / apache / lucene / search / TestFuzzyQuery.java
1 package org.apache.lucene.search;
2
3 /**
4  * Licensed to the Apache Software Foundation (ASF) under one or more
5  * contributor license agreements.  See the NOTICE file distributed with
6  * this work for additional information regarding copyright ownership.
7  * The ASF licenses this file to You under the Apache License, Version 2.0
8  * (the "License"); you may not use this file except in compliance with
9  * the License.  You may obtain a copy of the License at
10  *
11  *     http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19
20 import java.util.List;
21 import java.util.Arrays;
22 import java.io.IOException;
23
24 import org.apache.lucene.analysis.MockAnalyzer;
25 import org.apache.lucene.analysis.standard.StandardAnalyzer;
26 import org.apache.lucene.util.LuceneTestCase;
27 import org.apache.lucene.document.Document;
28 import org.apache.lucene.document.Field;
29 import org.apache.lucene.index.IndexReader;
30 import org.apache.lucene.index.MultiReader;
31 import org.apache.lucene.index.RandomIndexWriter;
32 import org.apache.lucene.index.Term;
33 import org.apache.lucene.store.Directory;
34 import org.apache.lucene.queryParser.QueryParser;
35
36 /**
37  * Tests {@link FuzzyQuery}.
38  *
39  */
40 public class TestFuzzyQuery extends LuceneTestCase {
41
42   public void testFuzziness() throws Exception {
43     Directory directory = newDirectory();
44     RandomIndexWriter writer = new RandomIndexWriter(random, directory);
45     addDoc("aaaaa", writer);
46     addDoc("aaaab", writer);
47     addDoc("aaabb", writer);
48     addDoc("aabbb", writer);
49     addDoc("abbbb", writer);
50     addDoc("bbbbb", writer);
51     addDoc("ddddd", writer);
52
53     IndexReader reader = writer.getReader();
54     IndexSearcher searcher = newSearcher(reader);
55     writer.close();
56
57     FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0);   
58     ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
59     assertEquals(3, hits.length);
60     
61     // same with prefix
62     query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 1);   
63     hits = searcher.search(query, null, 1000).scoreDocs;
64     assertEquals(3, hits.length);
65     query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 2);   
66     hits = searcher.search(query, null, 1000).scoreDocs;
67     assertEquals(3, hits.length);
68     query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 3);   
69     hits = searcher.search(query, null, 1000).scoreDocs;
70     assertEquals(3, hits.length);
71     query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 4);   
72     hits = searcher.search(query, null, 1000).scoreDocs;
73     assertEquals(2, hits.length);
74     query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 5);   
75     hits = searcher.search(query, null, 1000).scoreDocs;
76     assertEquals(1, hits.length);
77     query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 6);   
78     hits = searcher.search(query, null, 1000).scoreDocs;
79     assertEquals(1, hits.length);
80     
81     // test scoring
82     query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0);   
83     hits = searcher.search(query, null, 1000).scoreDocs;
84     assertEquals("3 documents should match", 3, hits.length);
85     List<String> order = Arrays.asList("bbbbb","abbbb","aabbb");
86     for (int i = 0; i < hits.length; i++) {
87       final String term = searcher.doc(hits[i].doc).get("field");
88       //System.out.println(hits[i].score);
89       assertEquals(order.get(i), term);
90     }
91
92     // test pq size by supplying maxExpansions=2
93     // This query would normally return 3 documents, because 3 terms match (see above):
94     query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0, 2); 
95     hits = searcher.search(query, null, 1000).scoreDocs;
96     assertEquals("only 2 documents should match", 2, hits.length);
97     order = Arrays.asList("bbbbb","abbbb");
98     for (int i = 0; i < hits.length; i++) {
99       final String term = searcher.doc(hits[i].doc).get("field");
100       //System.out.println(hits[i].score);
101       assertEquals(order.get(i), term);
102     }
103
104     // not similar enough:
105     query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);     
106     hits = searcher.search(query, null, 1000).scoreDocs;
107     assertEquals(0, hits.length);
108     query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMinSimilarity, 0);   // edit distance to "aaaaa" = 3
109     hits = searcher.search(query, null, 1000).scoreDocs;
110     assertEquals(0, hits.length);
111
112     // query identical to a word in the index:
113     query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0);   
114     hits = searcher.search(query, null, 1000).scoreDocs;
115     assertEquals(3, hits.length);
116     assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
117     // default allows for up to two edits:
118     assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
119     assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
120
121     // query similar to a word in the index:
122     query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 0);   
123     hits = searcher.search(query, null, 1000).scoreDocs;
124     assertEquals(3, hits.length);
125     assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
126     assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
127     assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
128     
129     // now with prefix
130     query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 1);   
131     hits = searcher.search(query, null, 1000).scoreDocs;
132     assertEquals(3, hits.length);
133     assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
134     assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
135     assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
136     query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 2);   
137     hits = searcher.search(query, null, 1000).scoreDocs;
138     assertEquals(3, hits.length);
139     assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
140     assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
141     assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
142     query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 3);   
143     hits = searcher.search(query, null, 1000).scoreDocs;
144     assertEquals(3, hits.length);
145     assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
146     assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
147     assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
148     query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 4);   
149     hits = searcher.search(query, null, 1000).scoreDocs;
150     assertEquals(2, hits.length);
151     assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
152     assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
153     query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 5);   
154     hits = searcher.search(query, null, 1000).scoreDocs;
155     assertEquals(0, hits.length);
156     
157
158     query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0);   
159     hits = searcher.search(query, null, 1000).scoreDocs;
160     assertEquals(1, hits.length);
161     assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
162     
163     // now with prefix
164     query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 1);   
165     hits = searcher.search(query, null, 1000).scoreDocs;
166     assertEquals(1, hits.length);
167     assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
168     query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 2);   
169     hits = searcher.search(query, null, 1000).scoreDocs;
170     assertEquals(1, hits.length);
171     assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
172     query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 3);   
173     hits = searcher.search(query, null, 1000).scoreDocs;
174     assertEquals(1, hits.length);
175     assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
176     query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 4);   
177     hits = searcher.search(query, null, 1000).scoreDocs;
178     assertEquals(1, hits.length);
179     assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
180     query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 5);   
181     hits = searcher.search(query, null, 1000).scoreDocs;
182     assertEquals(0, hits.length);
183     
184
185     // different field = no match:
186     query = new FuzzyQuery(new Term("anotherfield", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0);   
187     hits = searcher.search(query, null, 1000).scoreDocs;
188     assertEquals(0, hits.length);
189
190     searcher.close();
191     reader.close();
192     directory.close();
193   }
194
195   public void testFuzzinessLong() throws Exception {
196     Directory directory = newDirectory();
197     RandomIndexWriter writer = new RandomIndexWriter(random, directory);
198     addDoc("aaaaaaa", writer);
199     addDoc("segment", writer);
200
201     IndexReader reader = writer.getReader();
202     IndexSearcher searcher = newSearcher(reader);
203     writer.close();
204
205     FuzzyQuery query;
206     // not similar enough:
207     query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);   
208     ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
209     assertEquals(0, hits.length);
210     // edit distance to "aaaaaaa" = 3, this matches because the string is longer than
211     // in testDefaultFuzziness so a bigger difference is allowed:
212     query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 0);   
213     hits = searcher.search(query, null, 1000).scoreDocs;
214     assertEquals(1, hits.length);
215     assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
216     
217     // now with prefix
218     query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 1);   
219     hits = searcher.search(query, null, 1000).scoreDocs;
220     assertEquals(1, hits.length);
221     assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
222     query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 4);   
223     hits = searcher.search(query, null, 1000).scoreDocs;
224     assertEquals(1, hits.length);
225     assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
226     query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 5);   
227     hits = searcher.search(query, null, 1000).scoreDocs;
228     assertEquals(0, hits.length);
229
230     // no match, more than half of the characters is wrong:
231     query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 0);   
232     hits = searcher.search(query, null, 1000).scoreDocs;
233     assertEquals(0, hits.length);
234     
235     // now with prefix
236     query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 2);   
237     hits = searcher.search(query, null, 1000).scoreDocs;
238     assertEquals(0, hits.length);
239
240     // "student" and "stellent" are indeed similar to "segment" by default:
241     query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 0);   
242     hits = searcher.search(query, null, 1000).scoreDocs;
243     assertEquals(1, hits.length);
244     query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 0);   
245     hits = searcher.search(query, null, 1000).scoreDocs;
246     assertEquals(1, hits.length);
247     
248     // now with prefix
249     query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 1);   
250     hits = searcher.search(query, null, 1000).scoreDocs;
251     assertEquals(1, hits.length);
252     query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 1);   
253     hits = searcher.search(query, null, 1000).scoreDocs;
254     assertEquals(1, hits.length);
255     query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 2);   
256     hits = searcher.search(query, null, 1000).scoreDocs;
257     assertEquals(0, hits.length);
258     query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 2);   
259     hits = searcher.search(query, null, 1000).scoreDocs;
260     assertEquals(0, hits.length);
261     
262     // "student" doesn't match anymore thanks to increased minimum similarity:
263     query = new FuzzyQuery(new Term("field", "student"), 0.6f, 0);   
264     hits = searcher.search(query, null, 1000).scoreDocs;
265     assertEquals(0, hits.length);
266
267     try {
268       query = new FuzzyQuery(new Term("field", "student"), 1.1f);
269       fail("Expected IllegalArgumentException");
270     } catch (IllegalArgumentException e) {
271       // expecting exception
272     }
273     try {
274       query = new FuzzyQuery(new Term("field", "student"), -0.1f);
275       fail("Expected IllegalArgumentException");
276     } catch (IllegalArgumentException e) {
277       // expecting exception
278     }
279
280     searcher.close();
281     reader.close();
282     directory.close();
283   }
284
285   public void testTokenLengthOpt() throws IOException {
286     Directory directory = newDirectory();
287     RandomIndexWriter writer = new RandomIndexWriter(random, directory);
288     addDoc("12345678911", writer);
289     addDoc("segment", writer);
290
291     IndexReader reader = writer.getReader();
292     IndexSearcher searcher = newSearcher(reader);
293     writer.close();
294
295     Query query;
296     // term not over 10 chars, so optimization shortcuts
297     query = new FuzzyQuery(new Term("field", "1234569"), 0.9f);
298     ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
299     assertEquals(0, hits.length);
300
301     // 10 chars, so no optimization
302     query = new FuzzyQuery(new Term("field", "1234567891"), 0.9f);
303     hits = searcher.search(query, null, 1000).scoreDocs;
304     assertEquals(0, hits.length);
305     
306     // over 10 chars, so no optimization
307     query = new FuzzyQuery(new Term("field", "12345678911"), 0.9f);
308     hits = searcher.search(query, null, 1000).scoreDocs;
309     assertEquals(1, hits.length);
310
311     // over 10 chars, no match
312     query = new FuzzyQuery(new Term("field", "sdfsdfsdfsdf"), 0.9f);
313     hits = searcher.search(query, null, 1000).scoreDocs;
314     assertEquals(0, hits.length);
315     
316     searcher.close();
317     reader.close();
318     directory.close();
319   }
320   
321   /** Test the TopTermsBoostOnlyBooleanQueryRewrite rewrite method. */
322   public void testBoostOnlyRewrite() throws Exception {
323     Directory directory = newDirectory();
324     RandomIndexWriter writer = new RandomIndexWriter(random, directory);
325     addDoc("Lucene", writer);
326     addDoc("Lucene", writer);
327     addDoc("Lucenne", writer);
328
329     IndexReader reader = writer.getReader();
330     IndexSearcher searcher = newSearcher(reader);
331     writer.close();
332     
333     FuzzyQuery query = new FuzzyQuery(new Term("field", "Lucene"));
334     query.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(50));
335     ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
336     assertEquals(3, hits.length);
337     // normally, 'Lucenne' would be the first result as IDF will skew the score.
338     assertEquals("Lucene", reader.document(hits[0].doc).get("field"));
339     assertEquals("Lucene", reader.document(hits[1].doc).get("field"));
340     assertEquals("Lucenne", reader.document(hits[2].doc).get("field"));
341     searcher.close();
342     reader.close();
343     directory.close();
344   }
345   
346   public void testGiga() throws Exception {
347
348     MockAnalyzer analyzer = new MockAnalyzer(random);
349     Directory index = newDirectory();
350     RandomIndexWriter w = new RandomIndexWriter(random, index);
351
352     addDoc("Lucene in Action", w);
353     addDoc("Lucene for Dummies", w);
354
355     //addDoc("Giga", w);
356     addDoc("Giga byte", w);
357
358     addDoc("ManagingGigabytesManagingGigabyte", w);
359     addDoc("ManagingGigabytesManagingGigabytes", w);
360
361     addDoc("The Art of Computer Science", w);
362     addDoc("J. K. Rowling", w);
363     addDoc("JK Rowling", w);
364     addDoc("Joanne K Roling", w);
365     addDoc("Bruce Willis", w);
366     addDoc("Willis bruce", w);
367     addDoc("Brute willis", w);
368     addDoc("B. willis", w);
369     IndexReader r = w.getReader();
370     w.close();
371
372     Query q = new QueryParser(TEST_VERSION_CURRENT, "field", analyzer).parse( "giga~0.9" );
373
374     // 3. search
375     IndexSearcher searcher = newSearcher(r);
376     ScoreDoc[] hits = searcher.search(q, 10).scoreDocs;
377     assertEquals(1, hits.length);
378     assertEquals("Giga byte", searcher.doc(hits[0].doc).get("field"));
379     searcher.close();
380     r.close();
381     index.close();
382   }
383
384   private void addDoc(String text, RandomIndexWriter writer) throws IOException {
385     Document doc = new Document();
386     doc.add(newField("field", text, Field.Store.YES, Field.Index.ANALYZED));
387     writer.addDocument(doc);
388   }
389
390 }