1 package org.apache.lucene.search;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import java.io.IOException;
22 import org.apache.lucene.util.LuceneTestCase;
23 import org.apache.lucene.analysis.MockAnalyzer;
24 import org.apache.lucene.analysis.MockTokenizer;
25 import org.apache.lucene.document.Document;
26 import org.apache.lucene.document.Field;
27 import org.apache.lucene.document.Field.Index;
28 import org.apache.lucene.document.Field.Store;
29 import org.apache.lucene.index.IndexReader;
30 import org.apache.lucene.index.RandomIndexWriter;
31 import org.apache.lucene.index.Term;
32 import org.apache.lucene.search.IndexSearcher;
33 import org.apache.lucene.search.PhraseQuery;
34 import org.apache.lucene.store.Directory;
36 public class TestSloppyPhraseQuery extends LuceneTestCase {
38 private static final String S_1 = "A A A";
39 private static final String S_2 = "A 1 2 3 A 4 5 6 A";
41 private static final Document DOC_1 = makeDocument("X " + S_1 + " Y");
42 private static final Document DOC_2 = makeDocument("X " + S_2 + " Y");
43 private static final Document DOC_3 = makeDocument("X " + S_1 + " A Y");
44 private static final Document DOC_1_B = makeDocument("X " + S_1 + " Y N N N N " + S_1 + " Z");
45 private static final Document DOC_2_B = makeDocument("X " + S_2 + " Y N N N N " + S_2 + " Z");
46 private static final Document DOC_3_B = makeDocument("X " + S_1 + " A Y N N N N " + S_1 + " A Y");
47 private static final Document DOC_4 = makeDocument("A A X A X B A X B B A A X B A A");
48 private static final Document DOC_5_3 = makeDocument("H H H X X X H H H X X X H H H");
49 private static final Document DOC_5_4 = makeDocument("H H H H");
51 private static final PhraseQuery QUERY_1 = makePhraseQuery( S_1 );
52 private static final PhraseQuery QUERY_2 = makePhraseQuery( S_2 );
53 private static final PhraseQuery QUERY_4 = makePhraseQuery( "X A A");
54 private static final PhraseQuery QUERY_5_4 = makePhraseQuery( "H H H H");
57 * Test DOC_4 and QUERY_4.
58 * QUERY_4 has a fuzzy (len=1) match to DOC_4, so all slop values > 0 should succeed.
59 * But only the 3rd sequence of A's in DOC_4 will do.
61 public void testDoc4_Query4_All_Slops_Should_match() throws Exception {
62 for (int slop=0; slop<30; slop++) {
63 int numResultsExpected = slop<1 ? 0 : 1;
64 checkPhraseQuery(DOC_4, QUERY_4, slop, numResultsExpected);
69 * Test DOC_1 and QUERY_1.
70 * QUERY_1 has an exact match to DOC_1, so all slop values should succeed.
71 * Before LUCENE-1310, a slop value of 1 did not succeed.
73 public void testDoc1_Query1_All_Slops_Should_match() throws Exception {
74 for (int slop=0; slop<30; slop++) {
75 float score1 = checkPhraseQuery(DOC_1, QUERY_1, slop, 1);
76 float score2 = checkPhraseQuery(DOC_1_B, QUERY_1, slop, 1);
77 assertTrue("slop="+slop+" score2="+score2+" should be greater than score1 "+score1, score2>score1);
82 * Test DOC_2 and QUERY_1.
83 * 6 should be the minimum slop to make QUERY_1 match DOC_2.
84 * Before LUCENE-1310, 7 was the minimum.
86 public void testDoc2_Query1_Slop_6_or_more_Should_match() throws Exception {
87 for (int slop=0; slop<30; slop++) {
88 int numResultsExpected = slop<6 ? 0 : 1;
89 float score1 = checkPhraseQuery(DOC_2, QUERY_1, slop, numResultsExpected);
90 if (numResultsExpected>0) {
91 float score2 = checkPhraseQuery(DOC_2_B, QUERY_1, slop, 1);
92 assertTrue("slop="+slop+" score2="+score2+" should be greater than score1 "+score1, score2>score1);
98 * Test DOC_2 and QUERY_2.
99 * QUERY_2 has an exact match to DOC_2, so all slop values should succeed.
100 * Before LUCENE-1310, 0 succeeds, 1 through 7 fail, and 8 or greater succeeds.
102 public void testDoc2_Query2_All_Slops_Should_match() throws Exception {
103 for (int slop=0; slop<30; slop++) {
104 float score1 = checkPhraseQuery(DOC_2, QUERY_2, slop, 1);
105 float score2 = checkPhraseQuery(DOC_2_B, QUERY_2, slop, 1);
106 assertTrue("slop="+slop+" score2="+score2+" should be greater than score1 "+score1, score2>score1);
111 * Test DOC_3 and QUERY_1.
112 * QUERY_1 has an exact match to DOC_3, so all slop values should succeed.
114 public void testDoc3_Query1_All_Slops_Should_match() throws Exception {
115 for (int slop=0; slop<30; slop++) {
116 float score1 = checkPhraseQuery(DOC_3, QUERY_1, slop, 1);
117 float score2 = checkPhraseQuery(DOC_3_B, QUERY_1, slop, 1);
118 assertTrue("slop="+slop+" score2="+score2+" should be greater than score1 "+score1, score2>score1);
123 public void testDoc5_Query5_Any_Slop_Should_be_consistent() throws Exception {
125 for (int slop=0; slop<3; slop++) {
126 for (int trial=0; trial<nRepeats; trial++) {
127 // should steadily always find this one
128 checkPhraseQuery(DOC_5_4, QUERY_5_4, slop, 1);
130 for (int trial=0; trial<nRepeats; trial++) {
131 // should steadily never find this one
132 checkPhraseQuery(DOC_5_3, QUERY_5_4, slop, 0);
137 private float checkPhraseQuery(Document doc, PhraseQuery query, int slop, int expectedNumResults) throws Exception {
140 Directory ramDir = newDirectory();
141 RandomIndexWriter writer = new RandomIndexWriter(random, ramDir, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
142 writer.addDocument(doc);
144 IndexReader reader = writer.getReader();
146 IndexSearcher searcher = newSearcher(reader);
147 TopDocs td = searcher.search(query,null,10);
148 //System.out.println("slop: "+slop+" query: "+query+" doc: "+doc+" Expecting number of hits: "+expectedNumResults+" maxScore="+td.getMaxScore());
149 assertEquals("slop: "+slop+" query: "+query+" doc: "+doc+" Wrong number of hits", expectedNumResults, td.totalHits);
151 //QueryUtils.check(query,searcher);
157 return td.getMaxScore();
160 private static Document makeDocument(String docText) {
161 Document doc = new Document();
162 Field f = new Field("f", docText, Field.Store.NO, Field.Index.ANALYZED);
163 f.setOmitNorms(true);
168 private static PhraseQuery makePhraseQuery(String terms) {
169 PhraseQuery query = new PhraseQuery();
170 String[] t = terms.split(" +");
171 for (int i=0; i<t.length; i++) {
172 query.add(new Term("f", t[i]));
178 /** checks that no scores or freqs are infinite */
179 private void assertSaneScoring(PhraseQuery pq, IndexSearcher searcher) throws Exception {
180 searcher.search(pq, new Collector() {
184 public void setScorer(Scorer scorer) throws IOException {
185 this.scorer = scorer;
189 public void collect(int doc) throws IOException {
190 assertFalse(Float.isInfinite(scorer.freq()));
191 assertFalse(Float.isInfinite(scorer.score()));
195 public void setNextReader(IndexReader reader, int docBase) throws IOException {
200 public boolean acceptsDocsOutOfOrder() {
205 QueryUtils.check(random, pq, searcher);
209 public void testSlopWithHoles() throws Exception {
210 Directory dir = newDirectory();
211 RandomIndexWriter iw = new RandomIndexWriter(random, dir);
212 Field f = new Field("lyrics", "", Field.Store.NO, Field.Index.ANALYZED_NO_NORMS);
213 Document doc = new Document();
215 f.setValue("drug drug");
217 f.setValue("drug druggy drug");
219 f.setValue("drug druggy druggy drug");
221 f.setValue("drug druggy drug druggy drug");
223 IndexReader ir = iw.getReader();
225 IndexSearcher is = newSearcher(ir);
227 PhraseQuery pq = new PhraseQuery();
229 pq.add(new Term("lyrics", "drug"), 1);
230 pq.add(new Term("lyrics", "drug"), 4);
232 assertEquals(0, is.search(pq, 4).totalHits);
234 assertEquals(3, is.search(pq, 4).totalHits);
236 assertEquals(4, is.search(pq, 4).totalHits);
243 public void testInfiniteFreq1() throws Exception {
244 String document = "drug druggy drug drug drug";
246 Directory dir = newDirectory();
247 RandomIndexWriter iw = new RandomIndexWriter(random, dir);
248 Document doc = new Document();
249 doc.add(newField("lyrics", document, Store.NO, Index.ANALYZED_NO_NORMS));
251 IndexReader ir = iw.getReader();
254 IndexSearcher is = newSearcher(ir);
255 PhraseQuery pq = new PhraseQuery();
257 pq.add(new Term("lyrics", "drug"), 1);
258 pq.add(new Term("lyrics", "drug"), 3);
260 assertSaneScoring(pq, is);
267 public void testInfiniteFreq2() throws Exception {
269 "So much fun to be had in my head " +
270 "No more sunshine " +
271 "So much fun just lying in my bed " +
272 "No more sunshine " +
273 "I can't face the sunlight and the dirt outside " +
274 "Wanna stay in 666 where this darkness don't lie " +
275 "Drug drug druggy " +
276 "Got a feeling sweet like honey " +
277 "Drug drug druggy " +
278 "Need sensation like my baby " +
279 "Show me your scars you're so aware " +
280 "I'm not barbaric I just care " +
282 "I need a reflection to prove I exist " +
283 "No more sunshine " +
284 "I am a victim of designer blitz " +
285 "No more sunshine " +
286 "Dance like a robot when you're chained at the knee " +
287 "The C.I.A say you're all they'll ever need " +
288 "Drug drug druggy " +
289 "Got a feeling sweet like honey " +
290 "Drug drug druggy " +
291 "Need sensation like my baby " +
292 "Snort your lines you're so aware " +
293 "I'm not barbaric I just care " +
294 "Drug drug druggy " +
295 "Got a feeling sweet like honey " +
296 "Drug drug druggy " +
297 "Need sensation like my baby";
299 Directory dir = newDirectory();
301 RandomIndexWriter iw = new RandomIndexWriter(random, dir);
302 Document doc = new Document();
303 doc.add(newField("lyrics", document, Store.NO, Index.ANALYZED_NO_NORMS));
305 IndexReader ir = iw.getReader();
308 IndexSearcher is = newSearcher(ir);
310 PhraseQuery pq = new PhraseQuery();
312 pq.add(new Term("lyrics", "drug"), 1);
313 pq.add(new Term("lyrics", "drug"), 3);
315 assertSaneScoring(pq, is);