pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.5.0 / lucene / backwards / src / test / org / apache / lucene / search / TestMultiPhraseQuery.java
diff --git a/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java b/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java

new file mode 100644 (file)

index 0000000..892c7e7
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
@@ -0,0 +1,588 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.Explanation.IDFExplanation;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.LinkedList;
+import java.io.Reader;
+
+/**
+ * This class tests the MultiPhraseQuery class.
+ * 
+ * 
+ */
+public class TestMultiPhraseQuery extends LuceneTestCase {
+  
+  /**
+   * Checks prefix-expanded phrases: a fixed leading term followed by every
+   * indexed "body" term sharing a prefix, the mirrored arrangement, the same
+   * query with slop, and the rejection of terms from different fields.
+   */
+  public void testPhrasePrefix() throws IOException {
+    Directory indexStore = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, indexStore);
+    add("blueberry pie", writer);
+    add("blueberry strudel", writer);
+    add("blueberry pizza", writer);
+    add("blueberry chewing gum", writer);
+    add("bluebird pizza", writer);
+    add("bluebird foobar pizza", writer);
+    add("piccadilly circus", writer);
+
+    IndexReader reader = writer.getReader();
+    IndexSearcher searcher = newSearcher(reader);
+
+    // search for "blueberry pi*":
+    MultiPhraseQuery query1 = new MultiPhraseQuery();
+    // search for "strawberry pi*":
+    MultiPhraseQuery query2 = new MultiPhraseQuery();
+    query1.add(new Term("body", "blueberry"));
+    query2.add(new Term("body", "strawberry"));
+
+    // the "pi" prefix expands to "piccadilly", "pie" and "pizza".
+    Term[] piTerms = collectTermsWithPrefix(reader, "body", "pi");
+    query1.add(piTerms);
+    assertEquals("body:\"blueberry (piccadilly pie pizza)\"", query1.toString());
+    query2.add(piTerms);
+    assertEquals("body:\"strawberry (piccadilly pie pizza)\"", query2.toString());
+
+    ScoreDoc[] result;
+    result = searcher.search(query1, null, 1000).scoreDocs;
+    assertEquals(2, result.length);
+    result = searcher.search(query2, null, 1000).scoreDocs;
+    assertEquals(0, result.length);
+
+    // search for "blue* pizza":
+    MultiPhraseQuery query3 = new MultiPhraseQuery();
+    query3.add(collectTermsWithPrefix(reader, "body", "blue"));
+    query3.add(new Term("body", "pizza"));
+
+    result = searcher.search(query3, null, 1000).scoreDocs;
+    assertEquals(2, result.length); // blueberry pizza, bluebird pizza
+    assertEquals("body:\"(blueberry bluebird) pizza\"", query3.toString());
+
+    // test slop:
+    query3.setSlop(1);
+    result = searcher.search(query3, null, 1000).scoreDocs;
+
+    // just make sure no exc:
+    searcher.explain(query3, 0);
+
+    assertEquals(3, result.length); // blueberry pizza, bluebird pizza, bluebird foobar pizza
+
+    MultiPhraseQuery query4 = new MultiPhraseQuery();
+    try {
+      query4.add(new Term("field1", "foo"));
+      query4.add(new Term("field2", "foobar"));
+      fail();
+    } catch(IllegalArgumentException e) {
+      // okay, all terms must belong to the same field
+    }
+
+    writer.close();
+    searcher.close();
+    reader.close();
+    indexStore.close();
+  }
+
+  /**
+   * Collects, in enumeration order, every term of {@code field} whose text
+   * starts with {@code prefix}.
+   *
+   * @param reader index to enumerate terms from
+   * @param field name of the field the terms must belong to
+   * @param prefix leading text every returned term must have
+   * @return the matching terms; empty when nothing matches
+   * @throws IOException if the term enumeration cannot be read or closed
+   */
+  private Term[] collectTermsWithPrefix(IndexReader reader, String field, String prefix)
+      throws IOException {
+    LinkedList<Term> matches = new LinkedList<Term>();
+    TermEnum te = reader.terms(new Term(field, prefix));
+    try {
+      do {
+        Term term = te.term();
+        // term() is null once the enumeration is exhausted. The enumeration is
+        // ordered, so the first term outside the field or prefix ends the matches.
+        if (term == null || !field.equals(term.field()) || !term.text().startsWith(prefix)) {
+          break;
+        }
+        matches.add(term);
+      } while (te.next());
+    } finally {
+      // always release the enumeration, even when reading it fails
+      te.close();
+    }
+    return matches.toArray(new Term[0]);
+  }
+
+  /**
+   * LUCENE-2580: a three-position query whose last position lists two terms
+   * must report both indexed documents as hits.
+   */
+  public void testTall() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter indexWriter = new RandomIndexWriter(random, dir);
+    add("blueberry chocolate pie", indexWriter);
+    add("blueberry chocolate tart", indexWriter);
+    IndexReader reader = indexWriter.getReader();
+    indexWriter.close();
+
+    IndexSearcher indexSearcher = newSearcher(reader);
+    Term[] desserts = { new Term("body", "pie"), new Term("body", "tart") };
+    MultiPhraseQuery query = new MultiPhraseQuery();
+    query.add(new Term("body", "blueberry"));
+    query.add(new Term("body", "chocolate"));
+    query.add(desserts);
+
+    TopDocs top = indexSearcher.search(query, 1);
+    assertEquals(2, top.totalHits);
+
+    indexSearcher.close();
+    reader.close();
+    dir.close();
+  }
+  
+  /** Adds one document whose stored, analyzed "body" field contains {@code s}. */
+  private void add(String s, RandomIndexWriter writer) throws IOException {
+    final Document document = new Document();
+    final Field.Store stored = Field.Store.YES;
+    final Field.Index analyzed = Field.Index.ANALYZED;
+    document.add(newField("body", s, stored, analyzed));
+    writer.addDocument(document);
+  }
+  
+  /**
+   * Regression test for bug 33161 (now fixed). Triggering it needs an outer
+   * query with more than one term, all of them required, wrapping a
+   * MultiPhraseQuery that holds exactly one term array.
+   */
+  public void testBooleanQueryContainingSingleTermPrefixQuery()
+      throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter indexWriter = new RandomIndexWriter(random, dir);
+    add("blueberry pie", indexWriter);
+    add("blueberry chewing gum", indexWriter);
+    add("blue raspberry pie", indexWriter);
+
+    IndexReader indexReader = indexWriter.getReader();
+    IndexSearcher indexSearcher = newSearcher(indexReader);
+
+    // The combined query is equivalent to +body:pie +body:"blue*"
+    BooleanQuery combined = new BooleanQuery();
+    TermQuery pie = new TermQuery(new Term("body", "pie"));
+    combined.add(pie, BooleanClause.Occur.MUST);
+
+    Term[] blueTerms = { new Term("body", "blueberry"), new Term("body", "blue") };
+    MultiPhraseQuery trouble = new MultiPhraseQuery();
+    trouble.add(blueTerms);
+    combined.add(trouble, BooleanClause.Occur.MUST);
+
+    // without the fix, this search throws
+    TopDocs top = indexSearcher.search(combined, null, 1000);
+    ScoreDoc[] hits = top.scoreDocs;
+
+    assertEquals("Wrong number of hits", 2, hits.length);
+
+    // explaining the query must not throw either
+    indexSearcher.explain(combined, 0);
+
+    indexWriter.close();
+    indexSearcher.close();
+    indexReader.close();
+    dir.close();
+  }
+  
+  /**
+   * Regression test for #35626: a required type filter combined with a
+   * two-position MultiPhraseQuery must search without throwing and match
+   * nothing in this index.
+   */
+  public void testPhrasePrefixWithBooleanQuery() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter indexWriter = new RandomIndexWriter(random, dir);
+    add("This is a test", "object", indexWriter);
+    add("a note", "note", indexWriter);
+
+    IndexReader indexReader = indexWriter.getReader();
+    IndexSearcher indexSearcher = newSearcher(indexReader);
+
+    // The combined query is equivalent to +type:note +body:"a t*"
+    BooleanQuery combined = new BooleanQuery();
+    TermQuery noteType = new TermQuery(new Term("type", "note"));
+    combined.add(noteType, BooleanClause.Occur.MUST);
+
+    Term[] tTerms = { new Term("body", "test"), new Term("body", "this") };
+    MultiPhraseQuery trouble = new MultiPhraseQuery();
+    trouble.add(new Term("body", "a"));
+    trouble.add(tTerms);
+    combined.add(trouble, BooleanClause.Occur.MUST);
+
+    // without the fix for #35626, this search throws
+    TopDocs top = indexSearcher.search(combined, null, 1000);
+    ScoreDoc[] hits = top.scoreDocs;
+    assertEquals("Wrong number of hits", 0, hits.length);
+
+    indexWriter.close();
+    indexSearcher.close();
+    indexReader.close();
+    dir.close();
+  }
+  
+  /**
+   * A query whose second position only lists a term absent from the index
+   * must report zero hits, and explaining it must not throw.
+   */
+  public void testNoDocs() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter indexWriter = new RandomIndexWriter(random, dir);
+    add("a note", "note", indexWriter);
+
+    IndexReader indexReader = indexWriter.getReader();
+    IndexSearcher indexSearcher = newSearcher(indexReader);
+
+    Term[] missing = { new Term("body", "nope"), new Term("body", "nope") };
+    MultiPhraseQuery query = new MultiPhraseQuery();
+    query.add(new Term("body", "a"));
+    query.add(missing);
+
+    TopDocs top = indexSearcher.search(query, null, 1);
+    assertEquals("Wrong number of hits", 0, top.totalHits);
+
+    // explaining the query must not throw
+    indexSearcher.explain(query, 0);
+
+    indexWriter.close();
+    indexSearcher.close();
+    indexReader.close();
+    dir.close();
+  }
+  
+  /**
+   * Two queries must agree on equals() and hashCode() while they hold the
+   * same terms, and disagree as soon as only one of them gains a term.
+   */
+  public void testHashCodeAndEquals() {
+    MultiPhraseQuery left = new MultiPhraseQuery();
+    MultiPhraseQuery right = new MultiPhraseQuery();
+
+    // both empty
+    assertEquals(left.hashCode(), right.hashCode());
+    assertEquals(left, right);
+
+    // same single term on both sides
+    Term first = new Term("someField", "someText");
+    left.add(first);
+    right.add(first);
+    assertEquals(left.hashCode(), right.hashCode());
+    assertEquals(left, right);
+
+    // extra term on one side only
+    Term second = new Term("someField", "someMoreText");
+    left.add(second);
+    assertTrue(left.hashCode() != right.hashCode());
+    assertTrue(!left.equals(right));
+
+    // other side catches up
+    right.add(second);
+    assertEquals(left.hashCode(), right.hashCode());
+    assertEquals(left, right);
+  }
+  
+  /**
+   * Adds one document with a stored, analyzed "body" field holding {@code s}
+   * and a stored, non-analyzed "type" field holding {@code type}.
+   */
+  private void add(String s, String type, RandomIndexWriter writer)
+      throws IOException {
+    final Document document = new Document();
+    final Field.Store stored = Field.Store.YES;
+    document.add(newField("body", s, stored, Field.Index.ANALYZED));
+    document.add(newField("type", type, stored, Field.Index.NOT_ANALYZED));
+    writer.addDocument(document);
+  }
+  
+  /** LUCENE-2526: calling toString() on a query without any terms must not throw. */
+  public void testEmptyToString() {
+    MultiPhraseQuery empty = new MultiPhraseQuery();
+    empty.toString();
+  }
+  
+  /**
+   * Installs a similarity whose IDF explanation always reports 10 and checks
+   * that the query weight's sum of squared weights is 10 * 10.
+   */
+  public void testCustomIDF() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter indexWriter = new RandomIndexWriter(random, dir);
+    add("This is a test", "object", indexWriter);
+    add("a note", "note", indexWriter);
+
+    IndexReader indexReader = indexWriter.getReader();
+    IndexSearcher indexSearcher = newSearcher(indexReader);
+
+    // explanation returned for every term collection, whatever the index holds
+    final IDFExplanation fixedIdf = new IDFExplanation() {
+      @Override
+      public float getIdf() {
+        return 10f;
+      }
+
+      @Override
+      public String explain() {
+        return "just a test";
+      }
+    };
+    indexSearcher.setSimilarity(new DefaultSimilarity() {
+      @Override
+      public IDFExplanation idfExplain(Collection<Term> terms,
+          Searcher searcher) throws IOException {
+        return fixedIdf;
+      }
+    });
+
+    Term[] thisOrThat = { new Term("body", "this"), new Term("body", "that") };
+    MultiPhraseQuery query = new MultiPhraseQuery();
+    query.add(thisOrThat);
+    query.add(new Term("body", "is"));
+
+    Weight weight = query.createWeight(indexSearcher);
+    float sumOfSquares = weight.sumOfSquaredWeights();
+    assertEquals(10f * 10f, sumOfSquares, 0.001f);
+
+    indexWriter.close();
+    indexSearcher.close();
+    indexReader.close();
+    dir.close();
+  }
+
+  /** Immutable pairing of a token's text with the position it is placed at. */
+  private static class TokenAndPos {
+    /** Position assigned to the token. */
+    public final int pos;
+    /** Text of the token. */
+    public final String token;
+
+    public TokenAndPos(String token, int pos) {
+      this.pos = pos;
+      this.token = token;
+    }
+  }
+
+  /**
+   * Analyzer that disregards the field name and the reader it is handed and
+   * always produces a tokenizer over the fixed tokens given at construction.
+   */
+  private static class CannedAnalyzer extends Analyzer {
+    private final TokenAndPos[] tokens;
+
+    public CannedAnalyzer(TokenAndPos[] tokens) {
+      this.tokens = tokens;
+    }
+
+    @Override
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      final TokenStream canned = new CannedTokenizer(tokens);
+      return canned;
+    }
+  }
+
+  /**
+   * Tokenizer that replays a fixed token array, setting each token's text and
+   * a position increment equal to its position minus the previous token's.
+   */
+  private static class CannedTokenizer extends Tokenizer {
+    private final TokenAndPos[] tokens;
+    /** Index of the next token to emit. */
+    private int upto = 0;
+    /** Position of the most recently emitted token. */
+    private int lastPos = 0;
+    private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+
+    public CannedTokenizer(TokenAndPos[] tokens) {
+      this.tokens = tokens;
+    }
+
+    @Override
+    public final boolean incrementToken() throws IOException {
+      clearAttributes();
+      if (upto >= tokens.length) {
+        // every canned token has been emitted
+        return false;
+      }
+      final TokenAndPos current = tokens[upto];
+      upto++;
+      termAtt.setTermBuffer(current.token);
+      posIncrAtt.setPositionIncrement(current.pos - lastPos);
+      lastPos = current.pos;
+      return true;
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      // rewind so the same tokens can be replayed
+      upto = 0;
+      lastPos = 0;
+    }
+  }
+
+  /**
+   * Indexes the same document twice with tokens "a", "b" and "c" all at
+   * position 0, then checks that a MultiPhraseQuery placing (b | c) and a at
+   * position 0 hits both documents with equal scores.
+   */
+  public void testZeroPosIncr() throws IOException {
+    Directory dir = new RAMDirectory();
+    final TokenAndPos[] tokens = new TokenAndPos[3];
+    tokens[0] = new TokenAndPos("a", 0);
+    tokens[1] = new TokenAndPos("b", 0);
+    tokens[2] = new TokenAndPos("c", 0);
+
+    IndexWriter writer = new IndexWriter(dir, new CannedAnalyzer(tokens), true, IndexWriter.MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    // the field text is irrelevant: CannedAnalyzer always emits the canned tokens
+    doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED));
+    writer.addDocument(doc);
+    writer.addDocument(doc);
+    IndexReader r = writer.getReader();
+    writer.close();
+    IndexSearcher s = new IndexSearcher(r);
+    MultiPhraseQuery mpq = new MultiPhraseQuery();
+    //mpq.setSlop(1);
+
+    // NOTE: not great that if we do the else clause here we
+    // get different scores!  MultiPhraseQuery counts that
+    // phrase as occurring twice per doc (it should be 1, I
+    // think?).  This is because MultipleTermPositions is able to
+    // return the same position more than once (0, in this
+    // case):
+    if (true) {
+      mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
+      mpq.add(new Term[] {new Term("field", "a")}, 0);
+    } else {
+      mpq.add(new Term[] {new Term("field", "a")}, 0);
+      mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
+    }
+    TopDocs hits = s.search(mpq, 2);
+    assertEquals(2, hits.totalHits);
+    assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
+    /*
+    for(int hit=0;hit<hits.totalHits;hit++) {
+      ScoreDoc sd = hits.scoreDocs[hit];
+      System.out.println("  hit doc=" + sd.doc + " score=" + sd.score);
+    }
+    */
+    // release the searcher before its reader, as the other tests in this class do
+    s.close();
+    r.close();
+    dir.close();
+  }
+
+  /** Tokens of the single document indexed by doTestZeroPosIncrSloppy. */
+  private static final TokenAndPos[] INCR_0_DOC_TOKENS = {
+      new TokenAndPos("x", 0),
+      new TokenAndPos("a", 1),
+      new TokenAndPos("1", 1),
+      new TokenAndPos("m", 2), // not existing, relying on slop=2
+      new TokenAndPos("b", 3),
+      new TokenAndPos("1", 3),
+      new TokenAndPos("n", 4), // not existing, relying on slop=2
+      new TokenAndPos("c", 5),
+      new TokenAndPos("y", 6)
+  };
+
+  /** Query tokens, one term per entry; several entries share a position. */
+  private static final TokenAndPos[] INCR_0_QUERY_TOKENS_AND = {
+      new TokenAndPos("a", 0),
+      new TokenAndPos("1", 0),
+      new TokenAndPos("b", 1),
+      new TokenAndPos("1", 1),
+      new TokenAndPos("c", 2)
+  };
+
+  /** Query rows, one term array per row; the test using them expects a hit at slop 2. */
+  private static final TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_MATCH = {
+      { new TokenAndPos("a", 0) },
+      { new TokenAndPos("x", 0), new TokenAndPos("1", 0) },
+      { new TokenAndPos("b", 1) },
+      { new TokenAndPos("x", 1), new TokenAndPos("1", 1) },
+      { new TokenAndPos("c", 2) }
+  };
+
+  /** Query rows, one term array per row; the test using them expects no hit at slop 0 or 2. */
+  private static final TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN = {
+      { new TokenAndPos("x", 0) },
+      { new TokenAndPos("a", 0), new TokenAndPos("1", 0) },
+      { new TokenAndPos("x", 1) },
+      { new TokenAndPos("b", 1), new TokenAndPos("1", 1) },
+      { new TokenAndPos("c", 2) }
+  };
+  
+  /**
+   * Parsed mode: the query parser builds the MPQ, which is not strict about
+   * having all query terms in each position - one term per position is
+   * sufficient (OR logic).
+   */
+  public void testZeroPosIncrSloppyParsedAnd() throws IOException, ParseException {
+    final Analyzer canned = new CannedAnalyzer(INCR_0_QUERY_TOKENS_AND);
+    final QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, "field", canned);
+    final Query q = parser.parse("\"this text is acually ignored\"");
+    assertTrue("wrong query type!", q instanceof MultiPhraseQuery);
+    final MultiPhraseQuery parsed = (MultiPhraseQuery) q;
+
+    // slop as parsed, then 1: no hit expected
+    doTestZeroPosIncrSloppy(parsed, 0);
+    parsed.setSlop(1);
+    doTestZeroPosIncrSloppy(parsed, 0);
+    // slop 2: one hit expected
+    parsed.setSlop(2);
+    doTestZeroPosIncrSloppy(parsed, 1);
+  }
+  
+  /**
+   * Indexes one document analyzed into INCR_0_DOC_TOKENS, runs {@code q}
+   * against it and asserts the total hit count.
+   *
+   * @param q query to run
+   * @param nExpected expected value of the search's totalHits
+   * @throws IOException if indexing or searching fails
+   */
+  private void doTestZeroPosIncrSloppy(Query q, int nExpected) throws IOException {
+    Directory dir = newDirectory(); // random dir
+    IndexWriterConfig cfg = newIndexWriterConfig(TEST_VERSION_CURRENT, new CannedAnalyzer(INCR_0_DOC_TOKENS));
+    IndexWriter writer = new IndexWriter(dir, cfg);
+    Document doc = new Document();
+    // the field text is irrelevant: CannedAnalyzer always emits the canned tokens
+    doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED));
+    writer.addDocument(doc);
+    IndexReader r = IndexReader.open(writer,false);
+    writer.close();
+    IndexSearcher s = new IndexSearcher(r);
+
+    if (VERBOSE) {
+      System.out.println("QUERY=" + q);
+    }
+
+    TopDocs hits = s.search(q, 1);
+    assertEquals("wrong number of results", nExpected, hits.totalHits);
+
+    if (VERBOSE) {
+      for(int hit=0;hit<hits.totalHits;hit++) {
+        ScoreDoc sd = hits.scoreDocs[hit];
+        System.out.println("  hit doc=" + sd.doc + " score=" + sd.score);
+      }
+    }
+
+    // release the searcher before its reader, as the other tests in this class do
+    s.close();
+    r.close();
+    dir.close();
+  }
+
+  /**
+   * PQ AND mode - a phrase query assembled by hand, one term per entry of
+   * INCR_0_QUERY_TOKENS_AND at that entry's position.
+   */
+  public void testZeroPosIncrSloppyPqAnd() throws IOException, ParseException {
+    final PhraseQuery phrase = new PhraseQuery();
+    for (int i = 0; i < INCR_0_QUERY_TOKENS_AND.length; i++) {
+      final TokenAndPos entry = INCR_0_QUERY_TOKENS_AND[i];
+      phrase.add(new Term("field", entry.token), entry.pos);
+    }
+
+    // default slop, then 1: no hit expected
+    doTestZeroPosIncrSloppy(phrase, 0);
+    phrase.setSlop(1);
+    doTestZeroPosIncrSloppy(phrase, 0);
+    // slop 2: one hit expected
+    phrase.setSlop(2);
+    doTestZeroPosIncrSloppy(phrase, 1);
+  }
+
+  /**
+   * MPQ AND mode - a multi phrase query assembled by hand, each entry of
+   * INCR_0_QUERY_TOKENS_AND added as its own single-term array.
+   */
+  public void testZeroPosIncrSloppyMpqAnd() throws IOException, ParseException {
+    final MultiPhraseQuery query = new MultiPhraseQuery();
+    for (int i = 0; i < INCR_0_QUERY_TOKENS_AND.length; i++) {
+      final TokenAndPos entry = INCR_0_QUERY_TOKENS_AND[i];
+      final Term[] single = { new Term("field", entry.token) };
+      query.add(single, entry.pos); // AND logic
+    }
+
+    // default slop, then 1: no hit expected
+    doTestZeroPosIncrSloppy(query, 0);
+    query.setSlop(1);
+    doTestZeroPosIncrSloppy(query, 0);
+    // slop 2: one hit expected
+    query.setSlop(2);
+    doTestZeroPosIncrSloppy(query, 1);
+  }
+
+  /**
+   * MPQ combined AND OR mode - a multi phrase query assembled by hand from
+   * the rows of INCR_0_QUERY_TOKENS_AND_OR_MATCH.
+   */
+  public void testZeroPosIncrSloppyMpqAndOrMatch() throws IOException, ParseException {
+    final MultiPhraseQuery query = new MultiPhraseQuery();
+    for (int i = 0; i < INCR_0_QUERY_TOKENS_AND_OR_MATCH.length; i++) {
+      final TokenAndPos[] row = INCR_0_QUERY_TOKENS_AND_OR_MATCH[i];
+      // all terms of a row are added at the position of the row's first token
+      query.add(tapTerms(row), row[0].pos);
+    }
+
+    // default slop, then 1: no hit expected
+    doTestZeroPosIncrSloppy(query, 0);
+    query.setSlop(1);
+    doTestZeroPosIncrSloppy(query, 0);
+    // slop 2: one hit expected
+    query.setSlop(2);
+    doTestZeroPosIncrSloppy(query, 1);
+  }
+
+  /**
+   * MPQ combined AND OR mode, no match - a multi phrase query assembled by
+   * hand from the rows of INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN.
+   */
+  public void testZeroPosIncrSloppyMpqAndOrNoMatch() throws IOException, ParseException {
+    final MultiPhraseQuery query = new MultiPhraseQuery();
+    for (int i = 0; i < INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN.length; i++) {
+      final TokenAndPos[] row = INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN[i];
+      // all terms of a row are added at the position of the row's first token
+      query.add(tapTerms(row), row[0].pos);
+    }
+
+    // no hit expected, neither with the default slop nor with slop 2
+    doTestZeroPosIncrSloppy(query, 0);
+    query.setSlop(2);
+    doTestZeroPosIncrSloppy(query, 0);
+  }
+
+  /** Converts each token of {@code tap} into a Term of the "field" field, keeping order. */
+  private Term[] tapTerms(TokenAndPos[] tap) {
+    final Term[] converted = new Term[tap.length];
+    int slot = 0;
+    for (TokenAndPos entry : tap) {
+      converted[slot++] = new Term("field", entry.token);
+    }
+    return converted;
+  }
+  
+}
+