lucene-java-3.4.0/lucene/backwards/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java

   1 package org.apache.lucene.search;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.index.IndexWriterConfig;
  21 import org.apache.lucene.index.RandomIndexWriter;
  22 import org.apache.lucene.index.Term;
  23 import org.apache.lucene.index.TermEnum;
  24 import org.apache.lucene.index.IndexReader;
  25 import org.apache.lucene.queryParser.ParseException;
  26 import org.apache.lucene.queryParser.QueryParser;
  27 import org.apache.lucene.search.Explanation.IDFExplanation;
  28 import org.apache.lucene.store.Directory;
  29 import org.apache.lucene.analysis.Analyzer;
  30 import org.apache.lucene.analysis.SimpleAnalyzer;
  31 import org.apache.lucene.analysis.TokenStream;
  32 import org.apache.lucene.analysis.Tokenizer;
  33 import org.apache.lucene.analysis.standard.StandardAnalyzer;
  34 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  35 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  36 import org.apache.lucene.document.Document;
  37 import org.apache.lucene.document.Field;
  38 import org.apache.lucene.index.IndexWriter;
  39 import org.apache.lucene.search.IndexSearcher;
  40 import org.apache.lucene.store.RAMDirectory;
  41 import org.apache.lucene.util.LuceneTestCase;
  42
  43 import java.io.IOException;
  44 import java.util.Collection;
  45 import java.util.LinkedList;
  46 import java.io.Reader;
  47
  48 /**
  49  * This class tests the MultiPhraseQuery class.
  50  *
  51  *
  52  */
  53 public class TestMultiPhraseQuery extends LuceneTestCase {
  54
  55   public void testPhrasePrefix() throws IOException {
  56     Directory indexStore = newDirectory();
  57     RandomIndexWriter writer = new RandomIndexWriter(random, indexStore);
  58     add("blueberry pie", writer);
  59     add("blueberry strudel", writer);
  60     add("blueberry pizza", writer);
  61     add("blueberry chewing gum", writer);
  62     add("bluebird pizza", writer);
  63     add("bluebird foobar pizza", writer);
  64     add("piccadilly circus", writer);
  65
  66     IndexReader reader = writer.getReader();
  67     IndexSearcher searcher = newSearcher(reader);
  68
  69     // search for "blueberry pi*":
  70     MultiPhraseQuery query1 = new MultiPhraseQuery();
  71     // search for "strawberry pi*":
  72     MultiPhraseQuery query2 = new MultiPhraseQuery();
  73     query1.add(new Term("body", "blueberry"));
  74     query2.add(new Term("body", "strawberry"));
  75
  76     LinkedList<Term> termsWithPrefix = new LinkedList<Term>();
  77     IndexReader ir = reader;
  78
  79     // this TermEnum gives "piccadilly", "pie" and "pizza".
  80     String prefix = "pi";
  81     TermEnum te = ir.terms(new Term("body", prefix));
  82     do {
  83         if (te.term().text().startsWith(prefix))
  84         {
  85             termsWithPrefix.add(te.term());
  86         }
  87     } while (te.next());
  88
  89     query1.add(termsWithPrefix.toArray(new Term[0]));
  90     assertEquals("body:\"blueberry (piccadilly pie pizza)\"", query1.toString());
  91     query2.add(termsWithPrefix.toArray(new Term[0]));
  92     assertEquals("body:\"strawberry (piccadilly pie pizza)\"", query2.toString());
  93
  94     ScoreDoc[] result;
  95     result = searcher.search(query1, null, 1000).scoreDocs;
  96     assertEquals(2, result.length);
  97     result = searcher.search(query2, null, 1000).scoreDocs;
  98     assertEquals(0, result.length);
  99
 100     // search for "blue* pizza":
 101     MultiPhraseQuery query3 = new MultiPhraseQuery();
 102     termsWithPrefix.clear();
 103     prefix = "blue";
 104     te = ir.terms(new Term("body", prefix));
 105     do {
 106         if (te.term().text().startsWith(prefix))
 107         {
 108             termsWithPrefix.add(te.term());
 109         }
 110     } while (te.next());
 111     query3.add(termsWithPrefix.toArray(new Term[0]));
 112     query3.add(new Term("body", "pizza"));
 113
 114     result = searcher.search(query3, null, 1000).scoreDocs;
 115     assertEquals(2, result.length); // blueberry pizza, bluebird pizza
 116     assertEquals("body:\"(blueberry bluebird) pizza\"", query3.toString());
 117
 118     // test slop:
 119     query3.setSlop(1);
 120     result = searcher.search(query3, null, 1000).scoreDocs;
 121
 122     // just make sure no exc:
 123     searcher.explain(query3, 0);
 124
 125     assertEquals(3, result.length); // blueberry pizza, bluebird pizza, bluebird foobar pizza
 126
 127     MultiPhraseQuery query4 = new MultiPhraseQuery();
 128     try {
 129       query4.add(new Term("field1", "foo"));
 130       query4.add(new Term("field2", "foobar"));
 131       fail();
 132     } catch(IllegalArgumentException e) {
 133       // okay, all terms must belong to the same field
 134     }
 135
 136     writer.close();
 137     searcher.close();
 138     reader.close();
 139     indexStore.close();
 140   }
 141
 142   // LUCENE-2580
 143   public void testTall() throws IOException {
 144     Directory indexStore = newDirectory();
 145     RandomIndexWriter writer = new RandomIndexWriter(random, indexStore);
 146     add("blueberry chocolate pie", writer);
 147     add("blueberry chocolate tart", writer);
 148     IndexReader r = writer.getReader();
 149     writer.close();
 150
 151     IndexSearcher searcher = newSearcher(r);
 152     MultiPhraseQuery q = new MultiPhraseQuery();
 153     q.add(new Term("body", "blueberry"));
 154     q.add(new Term("body", "chocolate"));
 155     q.add(new Term[] {new Term("body", "pie"), new Term("body", "tart")});
 156     assertEquals(2, searcher.search(q, 1).totalHits);
 157     searcher.close();
 158     r.close();
 159     indexStore.close();
 160   }
 161
 162   private void add(String s, RandomIndexWriter writer) throws IOException {
 163     Document doc = new Document();
 164     doc.add(newField("body", s, Field.Store.YES, Field.Index.ANALYZED));
 165     writer.addDocument(doc);
 166   }
 167
 168   public void testBooleanQueryContainingSingleTermPrefixQuery()
 169       throws IOException {
 170     // this tests against bug 33161 (now fixed)
 171     // In order to cause the bug, the outer query must have more than one term
 172     // and all terms required.
 173     // The contained PhraseMultiQuery must contain exactly one term array.
 174     Directory indexStore = newDirectory();
 175     RandomIndexWriter writer = new RandomIndexWriter(random, indexStore);
 176     add("blueberry pie", writer);
 177     add("blueberry chewing gum", writer);
 178     add("blue raspberry pie", writer);
 179
 180     IndexReader reader = writer.getReader();
 181     IndexSearcher searcher = newSearcher(reader);
 182     // This query will be equivalent to +body:pie +body:"blue*"
 183     BooleanQuery q = new BooleanQuery();
 184     q.add(new TermQuery(new Term("body", "pie")), BooleanClause.Occur.MUST);
 185
 186     MultiPhraseQuery trouble = new MultiPhraseQuery();
 187     trouble.add(new Term[] {new Term("body", "blueberry"),
 188         new Term("body", "blue")});
 189     q.add(trouble, BooleanClause.Occur.MUST);
 190
 191     // exception will be thrown here without fix
 192     ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
 193
 194     assertEquals("Wrong number of hits", 2, hits.length);
 195
 196     // just make sure no exc:
 197     searcher.explain(q, 0);
 198
 199     writer.close();
 200     searcher.close();
 201     reader.close();
 202     indexStore.close();
 203   }
 204
 205   public void testPhrasePrefixWithBooleanQuery() throws IOException {
 206     Directory indexStore = newDirectory();
 207     RandomIndexWriter writer = new RandomIndexWriter(random, indexStore);
 208     add("This is a test", "object", writer);
 209     add("a note", "note", writer);
 210
 211     IndexReader reader = writer.getReader();
 212     IndexSearcher searcher = newSearcher(reader);
 213
 214     // This query will be equivalent to +type:note +body:"a t*"
 215     BooleanQuery q = new BooleanQuery();
 216     q.add(new TermQuery(new Term("type", "note")), BooleanClause.Occur.MUST);
 217
 218     MultiPhraseQuery trouble = new MultiPhraseQuery();
 219     trouble.add(new Term("body", "a"));
 220     trouble
 221         .add(new Term[] {new Term("body", "test"), new Term("body", "this")});
 222     q.add(trouble, BooleanClause.Occur.MUST);
 223
 224     // exception will be thrown here without fix for #35626:
 225     ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
 226     assertEquals("Wrong number of hits", 0, hits.length);
 227     writer.close();
 228     searcher.close();
 229     reader.close();
 230     indexStore.close();
 231   }
 232
 233   public void testNoDocs() throws Exception {
 234     Directory indexStore = newDirectory();
 235     RandomIndexWriter writer = new RandomIndexWriter(random, indexStore);
 236     add("a note", "note", writer);
 237
 238     IndexReader reader = writer.getReader();
 239     IndexSearcher searcher = newSearcher(reader);
 240
 241     MultiPhraseQuery q = new MultiPhraseQuery();
 242     q.add(new Term("body", "a"));
 243     q.add(new Term[] {new Term("body", "nope"), new Term("body", "nope")});
 244     assertEquals("Wrong number of hits", 0,
 245         searcher.search(q, null, 1).totalHits);
 246
 247     // just make sure no exc:
 248     searcher.explain(q, 0);
 249
 250     writer.close();
 251     searcher.close();
 252     reader.close();
 253     indexStore.close();
 254   }
 255
 256   public void testHashCodeAndEquals() {
 257     MultiPhraseQuery query1 = new MultiPhraseQuery();
 258     MultiPhraseQuery query2 = new MultiPhraseQuery();
 259
 260     assertEquals(query1.hashCode(), query2.hashCode());
 261     assertEquals(query1, query2);
 262
 263     Term term1 = new Term("someField", "someText");
 264
 265     query1.add(term1);
 266     query2.add(term1);
 267
 268     assertEquals(query1.hashCode(), query2.hashCode());
 269     assertEquals(query1, query2);
 270
 271     Term term2 = new Term("someField", "someMoreText");
 272
 273     query1.add(term2);
 274
 275     assertFalse(query1.hashCode() == query2.hashCode());
 276     assertFalse(query1.equals(query2));
 277
 278     query2.add(term2);
 279
 280     assertEquals(query1.hashCode(), query2.hashCode());
 281     assertEquals(query1, query2);
 282   }
 283
 284   private void add(String s, String type, RandomIndexWriter writer)
 285       throws IOException {
 286     Document doc = new Document();
 287     doc.add(newField("body", s, Field.Store.YES, Field.Index.ANALYZED));
 288     doc.add(newField("type", type, Field.Store.YES, Field.Index.NOT_ANALYZED));
 289     writer.addDocument(doc);
 290   }
 291
 292   // LUCENE-2526
 293   public void testEmptyToString() {
 294     new MultiPhraseQuery().toString();
 295   }
 296
 297   public void testCustomIDF() throws Exception {
 298     Directory indexStore = newDirectory();
 299     RandomIndexWriter writer = new RandomIndexWriter(random, indexStore);
 300     add("This is a test", "object", writer);
 301     add("a note", "note", writer);
 302
 303     IndexReader reader = writer.getReader();
 304     IndexSearcher searcher = newSearcher(reader);
 305     searcher.setSimilarity(new DefaultSimilarity() {
 306
 307       @Override
 308       public IDFExplanation idfExplain(Collection<Term> terms,
 309           Searcher searcher) throws IOException {
 310         return new IDFExplanation() {
 311
 312           @Override
 313           public float getIdf() {
 314             return 10f;
 315           }
 316
 317           @Override
 318           public String explain() {
 319             return "just a test";
 320           }
 321
 322         };
 323       }
 324     });
 325
 326     MultiPhraseQuery query = new MultiPhraseQuery();
 327     query.add(new Term[] { new Term("body", "this"), new Term("body", "that") });
 328     query.add(new Term("body", "is"));
 329     Weight weight = query.createWeight(searcher);
 330     assertEquals(10f * 10f, weight.sumOfSquaredWeights(), 0.001f);
 331
 332     writer.close();
 333     searcher.close();
 334     reader.close();
 335     indexStore.close();
 336   }
 337
 338   private static class TokenAndPos {
 339     public final String token;
 340     public final int pos;
 341     public TokenAndPos(String token, int pos) {
 342       this.token = token;
 343       this.pos = pos;
 344     }
 345   }
 346
 347   private static class CannedAnalyzer extends Analyzer {
 348     private final TokenAndPos[] tokens;
 349
 350     public CannedAnalyzer(TokenAndPos[] tokens) {
 351       this.tokens = tokens;
 352     }
 353
 354     @Override
 355     public TokenStream tokenStream(String fieldName, Reader reader) {
 356       return new CannedTokenizer(tokens);
 357     }
 358   }
 359
 360   private static class CannedTokenizer extends Tokenizer {
 361     private final TokenAndPos[] tokens;
 362     private int upto = 0;
 363     private int lastPos = 0;
 364     private final TermAttribute termAtt = addAttribute(TermAttribute.class);
 365     private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 366
 367     public CannedTokenizer(TokenAndPos[] tokens) {
 368       this.tokens = tokens;
 369     }
 370
 371     @Override
 372     public final boolean incrementToken() throws IOException {
 373       clearAttributes();
 374       if (upto < tokens.length) {
 375         final TokenAndPos token = tokens[upto++];
 376         termAtt.setTermBuffer(token.token);
 377         posIncrAtt.setPositionIncrement(token.pos - lastPos);
 378         lastPos = token.pos;
 379         return true;
 380       } else {
 381         return false;
 382       }
 383     }
 384   }
 385
 386   public void testZeroPosIncr() throws IOException {
 387     Directory dir = new RAMDirectory();
 388     final TokenAndPos[] tokens = new TokenAndPos[3];
 389     tokens[0] = new TokenAndPos("a", 0);
 390     tokens[1] = new TokenAndPos("b", 0);
 391     tokens[2] = new TokenAndPos("c", 0);
 392
 393     IndexWriter writer = new IndexWriter(dir, new CannedAnalyzer(tokens), true, IndexWriter.MaxFieldLength.LIMITED);
 394     Document doc = new Document();
 395     doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED));
 396     writer.addDocument(doc);
 397     writer.addDocument(doc);
 398     IndexReader r = writer.getReader();
 399     writer.close();
 400     IndexSearcher s = new IndexSearcher(r);
 401     MultiPhraseQuery mpq = new MultiPhraseQuery();
 402     //mpq.setSlop(1);
 403
 404     // NOTE: not great that if we do the else clause here we
 405     // get different scores!  MultiPhraseQuery counts that
 406     // phrase as occurring twice per doc (it should be 1, I
 407     // think?).  This is because MultipleTermPositions is able to
 408     // return the same position more than once (0, in this
 409     // case):
 410     if (true) {
 411       mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
 412       mpq.add(new Term[] {new Term("field", "a")}, 0);
 413     } else {
 414       mpq.add(new Term[] {new Term("field", "a")}, 0);
 415       mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
 416     }
 417     TopDocs hits = s.search(mpq, 2);
 418     assertEquals(2, hits.totalHits);
 419     assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
 420     /*
 421     for(int hit=0;hit<hits.totalHits;hit++) {
 422       ScoreDoc sd = hits.scoreDocs[hit];
 423       System.out.println("  hit doc=" + sd.doc + " score=" + sd.score);
 424     }
 425     */
 426     r.close();
 427     dir.close();
 428   }
 429
 430   private final static TokenAndPos[] INCR_0_DOC_TOKENS = new TokenAndPos[] {
 431       new TokenAndPos("x", 0),
 432       new TokenAndPos("a", 1),
 433       new TokenAndPos("1", 1),
 434       new TokenAndPos("m", 2), // not existing, relying on slop=2
 435       new TokenAndPos("b", 3),
 436       new TokenAndPos("1", 3),
 437       new TokenAndPos("n", 4), // not existing, relying on slop=2
 438       new TokenAndPos("c", 5),
 439       new TokenAndPos("y", 6)
 440   };
 441
 442   private final static TokenAndPos[] INCR_0_QUERY_TOKENS_AND = new TokenAndPos[] {
 443       new TokenAndPos("a", 0),
 444       new TokenAndPos("1", 0),
 445       new TokenAndPos("b", 1),
 446       new TokenAndPos("1", 1),
 447       new TokenAndPos("c", 2)
 448   };
 449
 450   private final static TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_MATCH = new TokenAndPos[][] {
 451       { new TokenAndPos("a", 0) },
 452       { new TokenAndPos("x", 0), new TokenAndPos("1", 0) },
 453       { new TokenAndPos("b", 1) },
 454       { new TokenAndPos("x", 1), new TokenAndPos("1", 1) },
 455       { new TokenAndPos("c", 2) }
 456   };
 457
 458   private final static TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN = new TokenAndPos[][] {
 459       { new TokenAndPos("x", 0) },
 460       { new TokenAndPos("a", 0), new TokenAndPos("1", 0) },
 461       { new TokenAndPos("x", 1) },
 462       { new TokenAndPos("b", 1), new TokenAndPos("1", 1) },
 463       { new TokenAndPos("c", 2) }
 464   };
 465
 466   /**
 467    * using query parser, MPQ will be created, and will not be strict about having all query terms
 468    * in each position - one of each position is sufficient (OR logic)
 469    */
 470   public void testZeroPosIncrSloppyParsedAnd() throws IOException, ParseException {
 471     QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new CannedAnalyzer(INCR_0_QUERY_TOKENS_AND));
 472     final Query q = qp.parse("\"this text is acually ignored\"");
 473     assertTrue("wrong query type!", q instanceof MultiPhraseQuery);
 474     doTestZeroPosIncrSloppy(q, 0);
 475     ((MultiPhraseQuery) q).setSlop(1);
 476     doTestZeroPosIncrSloppy(q, 0);
 477     ((MultiPhraseQuery) q).setSlop(2);
 478     doTestZeroPosIncrSloppy(q, 1);
 479   }
 480
 481   private void doTestZeroPosIncrSloppy(Query q, int nExpected) throws IOException {
 482     Directory dir = newDirectory(); // random dir
 483     IndexWriterConfig cfg = newIndexWriterConfig(TEST_VERSION_CURRENT, new CannedAnalyzer(INCR_0_DOC_TOKENS));
 484     IndexWriter writer = new IndexWriter(dir, cfg);
 485     Document doc = new Document();
 486     doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED));
 487     writer.addDocument(doc);
 488     IndexReader r = IndexReader.open(writer,false);
 489     writer.close();
 490     IndexSearcher s = new IndexSearcher(r);
 491
 492     if (VERBOSE) {
 493       System.out.println("QUERY=" + q);
 494     }
 495
 496     TopDocs hits = s.search(q, 1);
 497     assertEquals("wrong number of results", nExpected, hits.totalHits);
 498
 499     if (VERBOSE) {
 500       for(int hit=0;hit<hits.totalHits;hit++) {
 501         ScoreDoc sd = hits.scoreDocs[hit];
 502         System.out.println("  hit doc=" + sd.doc + " score=" + sd.score);
 503       }
 504     }
 505
 506     r.close();
 507     dir.close();
 508   }
 509
 510   /**
 511    * PQ AND Mode - Manually creating a phrase query
 512    */
 513   public void testZeroPosIncrSloppyPqAnd() throws IOException, ParseException {
 514     final PhraseQuery pq = new PhraseQuery();
 515     for (TokenAndPos tap : INCR_0_QUERY_TOKENS_AND) {
 516       pq.add(new Term("field",tap.token), tap.pos);
 517     }
 518     doTestZeroPosIncrSloppy(pq, 0);
 519     pq.setSlop(1);
 520     doTestZeroPosIncrSloppy(pq, 0);
 521     pq.setSlop(2);
 522     doTestZeroPosIncrSloppy(pq, 1);
 523   }
 524
 525   /**
 526    * MPQ AND Mode - Manually creating a multiple phrase query
 527    */
 528   public void testZeroPosIncrSloppyMpqAnd() throws IOException, ParseException {
 529     final MultiPhraseQuery mpq = new MultiPhraseQuery();
 530     for (TokenAndPos tap : INCR_0_QUERY_TOKENS_AND) {
 531       mpq.add(new Term[]{new Term("field",tap.token)}, tap.pos); //AND logic
 532     }
 533     doTestZeroPosIncrSloppy(mpq, 0);
 534     mpq.setSlop(1);
 535     doTestZeroPosIncrSloppy(mpq, 0);
 536     mpq.setSlop(2);
 537     doTestZeroPosIncrSloppy(mpq, 1);
 538   }
 539
 540   /**
 541    * MPQ Combined AND OR Mode - Manually creating a multiple phrase query
 542    */
 543   public void testZeroPosIncrSloppyMpqAndOrMatch() throws IOException, ParseException {
 544     final MultiPhraseQuery mpq = new MultiPhraseQuery();
 545     for (TokenAndPos tap[] : INCR_0_QUERY_TOKENS_AND_OR_MATCH) {
 546       Term[] terms = tapTerms(tap);
 547       final int pos = tap[0].pos;
 548       mpq.add(terms, pos); //AND logic in pos, OR across lines
 549     }
 550     doTestZeroPosIncrSloppy(mpq, 0);
 551     mpq.setSlop(1);
 552     doTestZeroPosIncrSloppy(mpq, 0);
 553     mpq.setSlop(2);
 554     doTestZeroPosIncrSloppy(mpq, 1);
 555   }
 556
 557   /**
 558    * MPQ Combined AND OR Mode - Manually creating a multiple phrase query - with no match
 559    */
 560   public void testZeroPosIncrSloppyMpqAndOrNoMatch() throws IOException, ParseException {
 561     final MultiPhraseQuery mpq = new MultiPhraseQuery();
 562     for (TokenAndPos tap[] : INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN) {
 563       Term[] terms = tapTerms(tap);
 564       final int pos = tap[0].pos;
 565       mpq.add(terms, pos); //AND logic in pos, OR across lines
 566     }
 567     doTestZeroPosIncrSloppy(mpq, 0);
 568     mpq.setSlop(2);
 569     doTestZeroPosIncrSloppy(mpq, 0);
 570   }
 571
 572   private Term[] tapTerms(TokenAndPos[] tap) {
 573     Term[] terms = new Term[tap.length];
 574     for (int i=0; i<terms.length; i++) {
 575       terms[i] = new Term("field",tap[i].token);
 576     }
 577     return terms;
 578   }
 579
 580 }
 581