pylucene 3.5.0-3
[pylucene.git] / lucene-java-3.5.0 / lucene / src / test / org / apache / lucene / search / spans / TestPayloadSpans.java
diff --git a/lucene-java-3.5.0/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java b/lucene-java-3.5.0/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java
new file mode 100644 (file)
index 0000000..d2793e6
--- /dev/null
@@ -0,0 +1,543 @@
+package org.apache.lucene.search.spans;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.payloads.PayloadHelper;
+import org.apache.lucene.search.payloads.PayloadSpanUtil;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestPayloadSpans extends LuceneTestCase {
+  private IndexSearcher searcher;
+  private Similarity similarity = new DefaultSimilarity();
+  protected IndexReader indexReader;
+  private IndexReader closeIndexReader;
+  private Directory directory;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    PayloadHelper helper = new PayloadHelper();
+    searcher = helper.setUp(random, similarity, 1000);
+    indexReader = searcher.getIndexReader();
+  }
+
+  public void testSpanTermQuery() throws Exception {
+    SpanTermQuery stq;
+    Spans spans;
+    stq = new SpanTermQuery(new Term(PayloadHelper.FIELD, "seventy"));
+    spans = stq.getSpans(indexReader);
+    assertTrue("spans is null and it shouldn't be", spans != null);
+    checkSpans(spans, 100, 1, 1, 1);
+
+    stq = new SpanTermQuery(new Term(PayloadHelper.NO_PAYLOAD_FIELD, "seventy"));  
+    spans = stq.getSpans(indexReader);
+    assertTrue("spans is null and it shouldn't be", spans != null);
+    checkSpans(spans, 100, 0, 0, 0);
+  }
+
+  public void testSpanFirst() throws IOException {
+
+    SpanQuery match;
+    SpanFirstQuery sfq;
+    match = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
+    sfq = new SpanFirstQuery(match, 2);
+    Spans spans = sfq.getSpans(indexReader);
+    checkSpans(spans, 109, 1, 1, 1);
+    //Test more complicated subclause
+    SpanQuery[] clauses = new SpanQuery[2];
+    clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
+    clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "hundred"));
+    match = new SpanNearQuery(clauses, 0, true);
+    sfq = new SpanFirstQuery(match, 2);
+    checkSpans(sfq.getSpans(indexReader), 100, 2, 1, 1);
+
+    match = new SpanNearQuery(clauses, 0, false);
+    sfq = new SpanFirstQuery(match, 2);
+    checkSpans(sfq.getSpans(indexReader), 100, 2, 1, 1);
+    
+  }
+  
+  public void testSpanNot() throws Exception {
+    SpanQuery[] clauses = new SpanQuery[2];
+    clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
+    clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "three"));
+    SpanQuery spq = new SpanNearQuery(clauses, 5, true);
+    SpanNotQuery snq = new SpanNotQuery(spq, new SpanTermQuery(new Term(PayloadHelper.FIELD, "two")));
+
+
+
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
+                                                     newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity));
+
+    Document doc = new Document();
+    doc.add(newField(PayloadHelper.FIELD, "one two three one four three",
+        Field.Store.YES, Field.Index.ANALYZED));
+    writer.addDocument(doc);
+    IndexReader reader = writer.getReader();
+    writer.close();
+
+    checkSpans(snq.getSpans(reader), 1,new int[]{2});
+    reader.close();
+    directory.close();
+  }
+  
+  public void testNestedSpans() throws Exception {
+    SpanTermQuery stq;
+    Spans spans;
+    IndexSearcher searcher = getSearcher();
+    stq = new SpanTermQuery(new Term(PayloadHelper.FIELD, "mark"));
+    spans = stq.getSpans(searcher.getIndexReader());
+    assertTrue("spans is null and it shouldn't be", spans != null);
+    checkSpans(spans, 0, null);
+
+
+    SpanQuery[] clauses = new SpanQuery[3];
+    clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr"));
+    clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy"));
+    clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx"));
+    SpanNearQuery spanNearQuery = new SpanNearQuery(clauses, 12, false);
+
+    spans = spanNearQuery.getSpans(searcher.getIndexReader());
+    assertTrue("spans is null and it shouldn't be", spans != null);
+    checkSpans(spans, 2, new int[]{3,3});
+
+     
+    clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx"));
+    clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr"));
+    clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy"));
+
+    spanNearQuery = new SpanNearQuery(clauses, 6, true);
+   
+    
+    spans = spanNearQuery.getSpans(searcher.getIndexReader());
+    assertTrue("spans is null and it shouldn't be", spans != null);
+    checkSpans(spans, 1, new int[]{3});
+     
+    clauses = new SpanQuery[2];
+     
+    clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx"));
+    clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr"));
+
+    spanNearQuery = new SpanNearQuery(clauses, 6, true);
+     
+    // xx within 6 of rr
+    
+    SpanQuery[] clauses2 = new SpanQuery[2];
+     
+    clauses2[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy"));
+    clauses2[1] = spanNearQuery;
+     
+    SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(clauses2, 6, false);
+    
+    // yy within 6 of xx within 6 of rr
+
+    spans = nestedSpanNearQuery.getSpans(searcher.getIndexReader());
+    assertTrue("spans is null and it shouldn't be", spans != null);
+    checkSpans(spans, 2, new int[]{3,3});
+    searcher.close();
+    closeIndexReader.close();
+    directory.close();
+  }
+  
+  public void testFirstClauseWithoutPayload() throws Exception {
+    Spans spans;
+    IndexSearcher searcher = getSearcher();
+
+    SpanQuery[] clauses = new SpanQuery[3];
+    clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "nopayload"));
+    clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "qq"));
+    clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "ss"));
+
+    SpanNearQuery spanNearQuery = new SpanNearQuery(clauses, 6, true);
+    
+    SpanQuery[] clauses2 = new SpanQuery[2];
+     
+    clauses2[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "pp"));
+    clauses2[1] = spanNearQuery;
+
+    SpanNearQuery snq = new SpanNearQuery(clauses2, 6, false);
+    
+    SpanQuery[] clauses3 = new SpanQuery[2];
+     
+    clauses3[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "np"));
+    clauses3[1] = snq;
+     
+    SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(clauses3, 6, false);
+
+    spans = nestedSpanNearQuery.getSpans(searcher.getIndexReader());
+    assertTrue("spans is null and it shouldn't be", spans != null);
+    checkSpans(spans, 1, new int[]{3});
+    searcher.close();
+    closeIndexReader.close();
+    directory.close();
+  }
+  
+  public void testHeavilyNestedSpanQuery() throws Exception {
+    Spans spans;
+    IndexSearcher searcher = getSearcher();
+
+    SpanQuery[] clauses = new SpanQuery[3];
+    clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
+    clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "two"));
+    clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "three"));
+
+    SpanNearQuery spanNearQuery = new SpanNearQuery(clauses, 5, true);
+   
+    clauses = new SpanQuery[3];
+    clauses[0] = spanNearQuery; 
+    clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "five"));
+    clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "six"));
+
+    SpanNearQuery spanNearQuery2 = new SpanNearQuery(clauses, 6, true);
+     
+    SpanQuery[] clauses2 = new SpanQuery[2];
+    clauses2[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "eleven"));
+    clauses2[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "ten"));
+    SpanNearQuery spanNearQuery3 = new SpanNearQuery(clauses2, 2, false);
+    
+    SpanQuery[] clauses3 = new SpanQuery[3];
+    clauses3[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "nine"));
+    clauses3[1] = spanNearQuery2;
+    clauses3[2] = spanNearQuery3;
+     
+    SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(clauses3, 6, false);
+
+    spans = nestedSpanNearQuery.getSpans(searcher.getIndexReader());
+    assertTrue("spans is null and it shouldn't be", spans != null);
+    checkSpans(spans, 2, new int[]{8, 8});
+    searcher.close();
+    closeIndexReader.close();
+    directory.close();
+  }
+  
+  public void testShrinkToAfterShortestMatch() throws CorruptIndexException,
+      LockObtainFailedException, IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
+                                                     newIndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer()));
+
+    Document doc = new Document();
+    doc.add(new Field("content", new StringReader("a b c d e f g h i j a k")));
+    writer.addDocument(doc);
+
+    IndexReader reader = writer.getReader();
+    IndexSearcher is = newSearcher(reader);
+    writer.close();
+
+    SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
+    SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
+    SpanQuery[] sqs = { stq1, stq2 };
+    SpanNearQuery snq = new SpanNearQuery(sqs, 1, true);
+    Spans spans = snq.getSpans(is.getIndexReader());
+
+    TopDocs topDocs = is.search(snq, 1);
+    Set<String> payloadSet = new HashSet<String>();
+    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
+      while (spans.next()) {
+        Collection<byte[]> payloads = spans.getPayload();
+
+        for (final byte [] payload : payloads) {
+          payloadSet.add(new String(payload));
+        }
+      }
+    }
+    assertEquals(2, payloadSet.size());
+    assertTrue(payloadSet.contains("a:Noise:10"));
+    assertTrue(payloadSet.contains("k:Noise:11"));
+    is.close();
+    reader.close();
+    directory.close();
+  }
+  
+  public void testShrinkToAfterShortestMatch2() throws CorruptIndexException,
+      LockObtainFailedException, IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
+                                                     newIndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer()));
+
+    Document doc = new Document();
+    doc.add(new Field("content", new StringReader("a b a d k f a h i k a k")));
+    writer.addDocument(doc);
+    IndexReader reader = writer.getReader();
+    IndexSearcher is = newSearcher(reader);
+    writer.close();
+
+    SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
+    SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
+    SpanQuery[] sqs = { stq1, stq2 };
+    SpanNearQuery snq = new SpanNearQuery(sqs, 0, true);
+    Spans spans = snq.getSpans(is.getIndexReader());
+
+    TopDocs topDocs = is.search(snq, 1);
+    Set<String> payloadSet = new HashSet<String>();
+    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
+      while (spans.next()) {
+        Collection<byte[]> payloads = spans.getPayload();
+        for (final byte[] payload : payloads) {
+          payloadSet.add(new String(payload));
+        }
+      }
+    }
+    assertEquals(2, payloadSet.size());
+    assertTrue(payloadSet.contains("a:Noise:10"));
+    assertTrue(payloadSet.contains("k:Noise:11"));
+    is.close();
+    reader.close();
+    directory.close();
+  }
+  
+  public void testShrinkToAfterShortestMatch3() throws CorruptIndexException,
+      LockObtainFailedException, IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
+                                                     newIndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer()));
+
+    Document doc = new Document();
+    doc.add(new Field("content", new StringReader("j k a l f k k p a t a k l k t a")));
+    writer.addDocument(doc);
+    IndexReader reader = writer.getReader();
+    IndexSearcher is = newSearcher(reader);
+    writer.close();
+
+    SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
+    SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
+    SpanQuery[] sqs = { stq1, stq2 };
+    SpanNearQuery snq = new SpanNearQuery(sqs, 0, true);
+    Spans spans = snq.getSpans(is.getIndexReader());
+
+    TopDocs topDocs = is.search(snq, 1);
+    Set<String> payloadSet = new HashSet<String>();
+    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
+      while (spans.next()) {
+        Collection<byte[]> payloads = spans.getPayload();
+
+        for (final byte [] payload : payloads) {
+          payloadSet.add(new String(payload));
+        }
+      }
+    }
+    assertEquals(2, payloadSet.size());
+    if(VERBOSE) {
+      for (final String payload : payloadSet)
+        System.out.println("match:" +  payload);
+      
+    }
+    assertTrue(payloadSet.contains("a:Noise:10"));
+    assertTrue(payloadSet.contains("k:Noise:11"));
+    is.close();
+    reader.close();
+    directory.close();
+  }
+  
+  public void testPayloadSpanUtil() throws Exception {
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
+                                                     newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity));
+
+    Document doc = new Document();
+    doc.add(newField(PayloadHelper.FIELD,"xx rr yy mm  pp", Field.Store.YES, Field.Index.ANALYZED));
+    writer.addDocument(doc);
+  
+    IndexReader reader = writer.getReader();
+    writer.close();
+    IndexSearcher searcher = newSearcher(reader);
+
+    PayloadSpanUtil psu = new PayloadSpanUtil(searcher.getIndexReader());
+    
+    Collection<byte[]> payloads = psu.getPayloadsForQuery(new TermQuery(new Term(PayloadHelper.FIELD, "rr")));
+    if(VERBOSE)
+      System.out.println("Num payloads:" + payloads.size());
+    for (final byte [] bytes : payloads) {
+      if(VERBOSE)
+        System.out.println(new String(bytes));
+    }
+    searcher.close();
+    reader.close();
+    directory.close();
+  }
+
+  private void checkSpans(Spans spans, int expectedNumSpans, int expectedNumPayloads,
+                          int expectedPayloadLength, int expectedFirstByte) throws IOException {
+    assertTrue("spans is null and it shouldn't be", spans != null);
+    //each position match should have a span associated with it, since there is just one underlying term query, there should
+    //only be one entry in the span
+    int seen = 0;
+    while (spans.next() == true)
+    {
+      //if we expect payloads, then isPayloadAvailable should be true
+      if (expectedNumPayloads > 0) {
+        assertTrue("isPayloadAvailable is not returning the correct value: " + spans.isPayloadAvailable()
+                + " and it should be: " + (expectedNumPayloads >  0),
+                spans.isPayloadAvailable() == true);
+      } else {
+        assertTrue("isPayloadAvailable should be false", spans.isPayloadAvailable() == false);
+      }
+      //See payload helper, for the PayloadHelper.FIELD field, there is a single byte payload at every token
+      if (spans.isPayloadAvailable()) {
+        Collection<byte[]> payload = spans.getPayload();
+        assertTrue("payload Size: " + payload.size() + " is not: " + expectedNumPayloads, payload.size() == expectedNumPayloads);
+        for (final byte [] thePayload : payload) {
+          assertTrue("payload[0] Size: " + thePayload.length + " is not: " + expectedPayloadLength,
+                  thePayload.length == expectedPayloadLength);
+          assertTrue(thePayload[0] + " does not equal: " + expectedFirstByte, thePayload[0] == expectedFirstByte);
+
+        }
+
+      }
+      seen++;
+    }
+    assertTrue(seen + " does not equal: " + expectedNumSpans, seen == expectedNumSpans);
+  }
+  
+  private IndexSearcher getSearcher() throws Exception {
+    directory = newDirectory();
+    String[] docs = new String[]{"xx rr yy mm  pp","xx yy mm rr pp", "nopayload qq ss pp np", "one two three four five six seven eight nine ten eleven", "nine one two three four five six seven eight eleven ten"};
+    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
+                                                     newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity));
+
+    Document doc = null;
+    for(int i = 0; i < docs.length; i++) {
+      doc = new Document();
+      String docText = docs[i];
+      doc.add(newField(PayloadHelper.FIELD,docText, Field.Store.YES, Field.Index.ANALYZED));
+      writer.addDocument(doc);
+    }
+
+    closeIndexReader = writer.getReader();
+    writer.close();
+
+    IndexSearcher searcher = newSearcher(closeIndexReader);
+    return searcher;
+  }
+  
+  private void checkSpans(Spans spans, int numSpans, int[] numPayloads) throws IOException {
+    int cnt = 0;
+
+    while (spans.next() == true) {
+      if(VERBOSE)
+        System.out.println("\nSpans Dump --");
+      if (spans.isPayloadAvailable()) {
+        Collection<byte[]> payload = spans.getPayload();
+        if(VERBOSE)
+          System.out.println("payloads for span:" + payload.size());
+        for (final byte [] bytes : payload) {
+          if(VERBOSE)
+            System.out.println("doc:" + spans.doc() + " s:" + spans.start() + " e:" + spans.end() + " "
+              + new String(bytes));
+        }
+
+        assertEquals(numPayloads[cnt],payload.size());
+      } else {
+        assertFalse("Expected spans:" + numPayloads[cnt] + " found: 0",numPayloads.length > 0 && numPayloads[cnt] > 0 );
+      }
+      cnt++;
+    }
+
+    assertEquals(numSpans, cnt);
+  }
+
+  final class PayloadAnalyzer extends Analyzer {
+
+    @Override
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      TokenStream result = new LowerCaseTokenizer(TEST_VERSION_CURRENT, reader);
+      result = new PayloadFilter(result, fieldName);
+      return result;
+    }
+  }
+
+  final class PayloadFilter extends TokenFilter {
+    String fieldName;
+    int numSeen = 0;
+    Set<String> entities = new HashSet<String>();
+    Set<String> nopayload = new HashSet<String>();
+    int pos;
+    PayloadAttribute payloadAtt;
+    CharTermAttribute termAtt;
+    PositionIncrementAttribute posIncrAtt;
+
+    public PayloadFilter(TokenStream input, String fieldName) {
+      super(input);
+      this.fieldName = fieldName;
+      pos = 0;
+      entities.add("xx");
+      entities.add("one");
+      nopayload.add("nopayload");
+      nopayload.add("np");
+      termAtt = addAttribute(CharTermAttribute.class);
+      posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+      payloadAtt = addAttribute(PayloadAttribute.class);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        String token = termAtt.toString();
+
+        if (!nopayload.contains(token)) {
+          if (entities.contains(token)) {
+            payloadAtt.setPayload(new Payload((token + ":Entity:"+ pos ).getBytes()));
+          } else {
+            payloadAtt.setPayload(new Payload((token + ":Noise:" + pos ).getBytes()));
+          }
+        }
+        pos += posIncrAtt.getPositionIncrement();
+        return true;
+      }
+      return false;
+    }
+  }
+  
+  public final class TestPayloadAnalyzer extends Analyzer {
+
+    @Override
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      TokenStream result = new LowerCaseTokenizer(TEST_VERSION_CURRENT, reader);
+      result = new PayloadFilter(result, fieldName);
+      return result;
+    }
+  }
+}