--- /dev/null
+package org.apache.lucene.search.spans;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.payloads.PayloadHelper;
+import org.apache.lucene.search.payloads.PayloadSpanUtil;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestPayloadSpans extends LuceneTestCase {
+ private IndexSearcher searcher;
+ private Similarity similarity = new DefaultSimilarity();
+ protected IndexReader indexReader;
+ private IndexReader closeIndexReader;
+ private Directory directory;
+
+ /**
+  * Prepares the shared fixture used by the tests that query {@code indexReader}:
+  * obtains a searcher from a fresh {@link PayloadHelper} and caches its reader.
+  * NOTE(review): the literal 1000 is presumably the number of indexed documents;
+  * confirm against PayloadHelper.
+  */
+ @Override
+ public void setUp() throws Exception {
+   super.setUp();
+   searcher = new PayloadHelper().setUp(random, similarity, 1000);
+   indexReader = searcher.getIndexReader();
+ }
+
+ /**
+  * Runs a span term query for "seventy" against both PayloadHelper fields and
+  * verifies the span count and payload expectations through checkSpans.
+  */
+ public void testSpanTermQuery() throws Exception {
+   // Payload-carrying field: 100 spans, each with one 1-byte payload whose first byte is 1.
+   SpanTermQuery payloadFieldQuery = new SpanTermQuery(new Term(PayloadHelper.FIELD, "seventy"));
+   Spans payloadFieldSpans = payloadFieldQuery.getSpans(indexReader);
+   assertTrue("spans is null and it shouldn't be", payloadFieldSpans != null);
+   checkSpans(payloadFieldSpans, 100, 1, 1, 1);
+
+   // Payload-free field: still 100 spans, but no payload may be reported on any of them.
+   SpanTermQuery plainFieldQuery = new SpanTermQuery(new Term(PayloadHelper.NO_PAYLOAD_FIELD, "seventy"));
+   Spans plainFieldSpans = plainFieldQuery.getSpans(indexReader);
+   assertTrue("spans is null and it shouldn't be", plainFieldSpans != null);
+   checkSpans(plainFieldSpans, 100, 0, 0, 0);
+ }
+
+ /**
+  * Verifies payload reporting for SpanFirstQuery (end limit 2) wrapped around a
+  * single term and around ordered/unordered SpanNearQuery sub-clauses.
+  */
+ public void testSpanFirst() throws IOException {
+   SpanQuery one = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
+   SpanFirstQuery firstOne = new SpanFirstQuery(one, 2);
+   checkSpans(firstOne.getSpans(indexReader), 109, 1, 1, 1);
+
+   // Test more complicated subclause
+   SpanQuery[] oneHundred = {
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "one")),
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "hundred"))
+   };
+
+   // In-order "one hundred" with zero slop: two payloads per span.
+   SpanFirstQuery firstOrdered = new SpanFirstQuery(new SpanNearQuery(oneHundred, 0, true), 2);
+   checkSpans(firstOrdered.getSpans(indexReader), 100, 2, 1, 1);
+
+   // Same clauses, any order: identical expectations.
+   SpanFirstQuery firstUnordered = new SpanFirstQuery(new SpanNearQuery(oneHundred, 0, false), 2);
+   checkSpans(firstUnordered.getSpans(indexReader), 100, 2, 1, 1);
+ }
+
+ /**
+  * Indexes the single document "one two three one four three" and checks that
+  * ("one" near "three", slop 5, in order) NOT "two" yields one span with two payloads.
+  */
+ public void testSpanNot() throws Exception {
+   Directory dir = newDirectory();
+   RandomIndexWriter indexWriter = new RandomIndexWriter(random, dir,
+       newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity));
+
+   Document document = new Document();
+   document.add(newField(PayloadHelper.FIELD, "one two three one four three",
+       Field.Store.YES, Field.Index.ANALYZED));
+   indexWriter.addDocument(document);
+   IndexReader reader = indexWriter.getReader();
+   indexWriter.close();
+
+   SpanQuery[] oneThenThree = {
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "one")),
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "three"))
+   };
+   SpanQuery include = new SpanNearQuery(oneThenThree, 5, true);
+   SpanQuery exclude = new SpanTermQuery(new Term(PayloadHelper.FIELD, "two"));
+   SpanNotQuery notQuery = new SpanNotQuery(include, exclude);
+
+   checkSpans(notQuery.getSpans(reader), 1, new int[] {2});
+   reader.close();
+   dir.close();
+ }
+
+ /**
+  * Checks span and payload counts for flat and nested SpanNearQuery combinations
+  * of "xx", "rr" and "yy" against the index built by getSearcher().
+  */
+ public void testNestedSpans() throws Exception {
+   IndexSearcher nestedSearcher = getSearcher();
+
+   // A term with zero expected spans; no per-span payload expectations are supplied.
+   SpanTermQuery markQuery = new SpanTermQuery(new Term(PayloadHelper.FIELD, "mark"));
+   Spans spans = markQuery.getSpans(nestedSearcher.getIndexReader());
+   assertTrue("spans is null and it shouldn't be", spans != null);
+   checkSpans(spans, 0, null);
+
+   // Three terms in any order within a slop of 12.
+   SpanQuery[] anyOrderTerms = {
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr")),
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy")),
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx"))
+   };
+   SpanNearQuery anyOrder = new SpanNearQuery(anyOrderTerms, 12, false);
+   spans = anyOrder.getSpans(nestedSearcher.getIndexReader());
+   assertTrue("spans is null and it shouldn't be", spans != null);
+   checkSpans(spans, 2, new int[] {3, 3});
+
+   // The same three terms, now required in the order xx, rr, yy within a slop of 6.
+   SpanQuery[] inOrderTerms = {
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx")),
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr")),
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy"))
+   };
+   SpanNearQuery inOrder = new SpanNearQuery(inOrderTerms, 6, true);
+   spans = inOrder.getSpans(nestedSearcher.getIndexReader());
+   assertTrue("spans is null and it shouldn't be", spans != null);
+   checkSpans(spans, 1, new int[] {3});
+
+   // xx within 6 of rr
+   SpanQuery[] xxThenRr = {
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx")),
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr"))
+   };
+   SpanNearQuery inner = new SpanNearQuery(xxThenRr, 6, true);
+
+   // yy within 6 of xx within 6 of rr
+   SpanQuery[] yyWithInner = {
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy")),
+     inner
+   };
+   SpanNearQuery nested = new SpanNearQuery(yyWithInner, 6, false);
+   spans = nested.getSpans(nestedSearcher.getIndexReader());
+   assertTrue("spans is null and it shouldn't be", spans != null);
+   checkSpans(spans, 2, new int[] {3, 3});
+
+   // Release everything getSearcher() opened.
+   nestedSearcher.close();
+   closeIndexReader.close();
+   directory.close();
+ }
+
+ /**
+  * Nests three levels of SpanNearQuery whose leading clauses ("nopayload", "np")
+  * are terms the PayloadFilter leaves without a payload, and expects a single
+  * span carrying three payloads.
+  */
+ public void testFirstClauseWithoutPayload() throws Exception {
+   IndexSearcher nestedSearcher = getSearcher();
+
+   // Innermost: nopayload, qq, ss in order within a slop of 6.
+   SpanQuery[] innerTerms = {
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "nopayload")),
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "qq")),
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "ss"))
+   };
+   SpanNearQuery innermost = new SpanNearQuery(innerTerms, 6, true);
+
+   // Middle: pp near the innermost match, any order.
+   SpanQuery[] middleClauses = {
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "pp")),
+     innermost
+   };
+   SpanNearQuery middle = new SpanNearQuery(middleClauses, 6, false);
+
+   // Outer: np near the middle match, any order.
+   SpanQuery[] outerClauses = {
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "np")),
+     middle
+   };
+   SpanNearQuery outer = new SpanNearQuery(outerClauses, 6, false);
+
+   Spans spans = outer.getSpans(nestedSearcher.getIndexReader());
+   assertTrue("spans is null and it shouldn't be", spans != null);
+   checkSpans(spans, 1, new int[] {3});
+
+   // Release everything getSearcher() opened.
+   nestedSearcher.close();
+   closeIndexReader.close();
+   directory.close();
+ }
+
+ /**
+  * Combines several nested SpanNearQuery levels over the number-word documents
+  * from getSearcher() and expects two spans with eight payloads each.
+  */
+ public void testHeavilyNestedSpanQuery() throws Exception {
+   IndexSearcher nestedSearcher = getSearcher();
+
+   // one, two, three in order within a slop of 5.
+   SpanQuery[] oneTwoThree = {
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "one")),
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "two")),
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "three"))
+   };
+   SpanNearQuery firstThree = new SpanNearQuery(oneTwoThree, 5, true);
+
+   // (one two three), five, six in order within a slop of 6.
+   SpanQuery[] throughSix = {
+     firstThree,
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "five")),
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "six"))
+   };
+   SpanNearQuery oneToSix = new SpanNearQuery(throughSix, 6, true);
+
+   // eleven and ten in any order within a slop of 2.
+   SpanQuery[] elevenTen = {
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "eleven")),
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "ten"))
+   };
+   SpanNearQuery tenEleven = new SpanNearQuery(elevenTen, 2, false);
+
+   // nine, the one-to-six match and the ten/eleven match, any order within a slop of 6.
+   SpanQuery[] outerClauses = {
+     new SpanTermQuery(new Term(PayloadHelper.FIELD, "nine")),
+     oneToSix,
+     tenEleven
+   };
+   SpanNearQuery outer = new SpanNearQuery(outerClauses, 6, false);
+
+   Spans spans = outer.getSpans(nestedSearcher.getIndexReader());
+   assertTrue("spans is null and it shouldn't be", spans != null);
+   checkSpans(spans, 2, new int[] {8, 8});
+
+   // Release everything getSearcher() opened.
+   nestedSearcher.close();
+   closeIndexReader.close();
+   directory.close();
+ }
+
+ /**
+  * Indexes "a b c d e f g h i j a k" and checks that an ordered near query
+  * for "a" then "k" with slop 1 reports only the payloads of the closest pair.
+  */
+ public void testShrinkToAfterShortestMatch() throws CorruptIndexException,
+     LockObtainFailedException, IOException {
+   Directory dir = newDirectory();
+   RandomIndexWriter indexWriter = new RandomIndexWriter(random, dir,
+       newIndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer()));
+
+   Document document = new Document();
+   document.add(new Field("content", new StringReader("a b c d e f g h i j a k")));
+   indexWriter.addDocument(document);
+
+   IndexReader reader = indexWriter.getReader();
+   IndexSearcher is = newSearcher(reader);
+   indexWriter.close();
+
+   SpanQuery[] aThenK = {
+     new SpanTermQuery(new Term("content", "a")),
+     new SpanTermQuery(new Term("content", "k"))
+   };
+   SpanNearQuery nearQuery = new SpanNearQuery(aThenK, 1, true);
+   Spans spans = nearQuery.getSpans(is.getIndexReader());
+
+   TopDocs hits = is.search(nearQuery, 1);
+   Set<String> seenPayloads = new HashSet<String>();
+   // Drain the spans once per returned hit, decoding every payload to a string.
+   for (int hit = 0; hit < hits.scoreDocs.length; hit++) {
+     while (spans.next()) {
+       for (final byte[] raw : spans.getPayload()) {
+         seenPayloads.add(new String(raw));
+       }
+     }
+   }
+
+   assertEquals(2, seenPayloads.size());
+   assertTrue(seenPayloads.contains("a:Noise:10"));
+   assertTrue(seenPayloads.contains("k:Noise:11"));
+   is.close();
+   reader.close();
+   dir.close();
+ }
+
+ /**
+  * Indexes "a b a d k f a h i k a k" and checks that an ordered near query
+  * for "a" then "k" with slop 0 reports only the payloads of the adjacent pair.
+  */
+ public void testShrinkToAfterShortestMatch2() throws CorruptIndexException,
+     LockObtainFailedException, IOException {
+   Directory dir = newDirectory();
+   RandomIndexWriter indexWriter = new RandomIndexWriter(random, dir,
+       newIndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer()));
+
+   Document document = new Document();
+   document.add(new Field("content", new StringReader("a b a d k f a h i k a k")));
+   indexWriter.addDocument(document);
+   IndexReader reader = indexWriter.getReader();
+   IndexSearcher is = newSearcher(reader);
+   indexWriter.close();
+
+   SpanQuery[] aThenK = {
+     new SpanTermQuery(new Term("content", "a")),
+     new SpanTermQuery(new Term("content", "k"))
+   };
+   SpanNearQuery nearQuery = new SpanNearQuery(aThenK, 0, true);
+   Spans spans = nearQuery.getSpans(is.getIndexReader());
+
+   TopDocs hits = is.search(nearQuery, 1);
+   Set<String> seenPayloads = new HashSet<String>();
+   // Drain the spans once per returned hit, decoding every payload to a string.
+   for (int hit = 0; hit < hits.scoreDocs.length; hit++) {
+     while (spans.next()) {
+       for (final byte[] raw : spans.getPayload()) {
+         seenPayloads.add(new String(raw));
+       }
+     }
+   }
+
+   assertEquals(2, seenPayloads.size());
+   assertTrue(seenPayloads.contains("a:Noise:10"));
+   assertTrue(seenPayloads.contains("k:Noise:11"));
+   is.close();
+   reader.close();
+   dir.close();
+ }
+
+ /**
+  * Indexes "j k a l f k k p a t a k l k t a" and checks that an ordered near
+  * query for "a" then "k" with slop 0 reports only the payloads of the adjacent pair.
+  */
+ public void testShrinkToAfterShortestMatch3() throws CorruptIndexException,
+     LockObtainFailedException, IOException {
+   Directory directory = newDirectory();
+   RandomIndexWriter writer = new RandomIndexWriter(random, directory,
+       newIndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer()));
+
+   Document doc = new Document();
+   doc.add(new Field("content", new StringReader("j k a l f k k p a t a k l k t a")));
+   writer.addDocument(doc);
+   IndexReader reader = writer.getReader();
+   IndexSearcher is = newSearcher(reader);
+   writer.close();
+
+   SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
+   SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
+   SpanQuery[] sqs = { stq1, stq2 };
+   SpanNearQuery snq = new SpanNearQuery(sqs, 0, true);
+   Spans spans = snq.getSpans(is.getIndexReader());
+
+   TopDocs topDocs = is.search(snq, 1);
+   Set<String> payloadSet = new HashSet<String>();
+   for (int i = 0; i < topDocs.scoreDocs.length; i++) {
+     while (spans.next()) {
+       Collection<byte[]> payloads = spans.getPayload();
+
+       for (final byte[] payload : payloads) {
+         payloadSet.add(new String(payload));
+       }
+     }
+   }
+
+   // Dump what was collected BEFORE asserting: the output is only useful for
+   // diagnosing a failure, and a failed assertion would otherwise skip it.
+   if (VERBOSE) {
+     for (final String payload : payloadSet) {
+       System.out.println("match:" + payload);
+     }
+   }
+   assertEquals(2, payloadSet.size());
+   assertTrue(payloadSet.contains("a:Noise:10"));
+   assertTrue(payloadSet.contains("k:Noise:11"));
+   is.close();
+   reader.close();
+   directory.close();
+ }
+
+ /**
+  * Smoke test for PayloadSpanUtil: indexes "xx rr yy mm pp" and fetches the
+  * payloads for a TermQuery on "rr". The result is only printed when VERBOSE is
+  * set; nothing is asserted about it.
+  */
+ public void testPayloadSpanUtil() throws Exception {
+   Directory dir = newDirectory();
+   RandomIndexWriter indexWriter = new RandomIndexWriter(random, dir,
+       newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity));
+
+   Document document = new Document();
+   document.add(newField(PayloadHelper.FIELD, "xx rr yy mm pp", Field.Store.YES, Field.Index.ANALYZED));
+   indexWriter.addDocument(document);
+
+   IndexReader reader = indexWriter.getReader();
+   indexWriter.close();
+   IndexSearcher utilSearcher = newSearcher(reader);
+
+   PayloadSpanUtil spanUtil = new PayloadSpanUtil(utilSearcher.getIndexReader());
+   TermQuery rrQuery = new TermQuery(new Term(PayloadHelper.FIELD, "rr"));
+   Collection<byte[]> found = spanUtil.getPayloadsForQuery(rrQuery);
+
+   if (VERBOSE) {
+     System.out.println("Num payloads:" + found.size());
+   }
+   for (final byte[] raw : found) {
+     if (VERBOSE) {
+       System.out.println(new String(raw));
+     }
+   }
+
+   utilSearcher.close();
+   reader.close();
+   dir.close();
+ }
+
+ private void checkSpans(Spans spans, int expectedNumSpans, int expectedNumPayloads,
+ int expectedPayloadLength, int expectedFirstByte) throws IOException {
+ assertTrue("spans is null and it shouldn't be", spans != null);
+ //each position match should have a span associated with it, since there is just one underlying term query, there should
+ //only be one entry in the span
+ int seen = 0;
+ while (spans.next() == true)
+ {
+ //if we expect payloads, then isPayloadAvailable should be true
+ if (expectedNumPayloads > 0) {
+ assertTrue("isPayloadAvailable is not returning the correct value: " + spans.isPayloadAvailable()
+ + " and it should be: " + (expectedNumPayloads > 0),
+ spans.isPayloadAvailable() == true);
+ } else {
+ assertTrue("isPayloadAvailable should be false", spans.isPayloadAvailable() == false);
+ }
+ //See payload helper, for the PayloadHelper.FIELD field, there is a single byte payload at every token
+ if (spans.isPayloadAvailable()) {
+ Collection<byte[]> payload = spans.getPayload();
+ assertTrue("payload Size: " + payload.size() + " is not: " + expectedNumPayloads, payload.size() == expectedNumPayloads);
+ for (final byte [] thePayload : payload) {
+ assertTrue("payload[0] Size: " + thePayload.length + " is not: " + expectedPayloadLength,
+ thePayload.length == expectedPayloadLength);
+ assertTrue(thePayload[0] + " does not equal: " + expectedFirstByte, thePayload[0] == expectedFirstByte);
+
+ }
+
+ }
+ seen++;
+ }
+ assertTrue(seen + " does not equal: " + expectedNumSpans, seen == expectedNumSpans);
+ }
+
+ private IndexSearcher getSearcher() throws Exception {
+ directory = newDirectory();
+ String[] docs = new String[]{"xx rr yy mm pp","xx yy mm rr pp", "nopayload qq ss pp np", "one two three four five six seven eight nine ten eleven", "nine one two three four five six seven eight eleven ten"};
+ RandomIndexWriter writer = new RandomIndexWriter(random, directory,
+ newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity));
+
+ Document doc = null;
+ for(int i = 0; i < docs.length; i++) {
+ doc = new Document();
+ String docText = docs[i];
+ doc.add(newField(PayloadHelper.FIELD,docText, Field.Store.YES, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ }
+
+ closeIndexReader = writer.getReader();
+ writer.close();
+
+ IndexSearcher searcher = newSearcher(closeIndexReader);
+ return searcher;
+ }
+
+ private void checkSpans(Spans spans, int numSpans, int[] numPayloads) throws IOException {
+ int cnt = 0;
+
+ while (spans.next() == true) {
+ if(VERBOSE)
+ System.out.println("\nSpans Dump --");
+ if (spans.isPayloadAvailable()) {
+ Collection<byte[]> payload = spans.getPayload();
+ if(VERBOSE)
+ System.out.println("payloads for span:" + payload.size());
+ for (final byte [] bytes : payload) {
+ if(VERBOSE)
+ System.out.println("doc:" + spans.doc() + " s:" + spans.start() + " e:" + spans.end() + " "
+ + new String(bytes));
+ }
+
+ assertEquals(numPayloads[cnt],payload.size());
+ } else {
+ assertFalse("Expected spans:" + numPayloads[cnt] + " found: 0",numPayloads.length > 0 && numPayloads[cnt] > 0 );
+ }
+ cnt++;
+ }
+
+ assertEquals(numSpans, cnt);
+ }
+
+ /** Analyzer whose token stream is a LowerCaseTokenizer wrapped in a PayloadFilter. */
+ final class PayloadAnalyzer extends Analyzer {
+
+   @Override
+   public TokenStream tokenStream(String fieldName, Reader reader) {
+     return new PayloadFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, reader), fieldName);
+   }
+ }
+
+ /**
+  * Token filter that attaches a textual payload of the form
+  * {@code <term>:Entity:<pos>} or {@code <term>:Noise:<pos>} to every token,
+  * except for the terms listed in {@code nopayload}, which are passed through
+  * without a payload being set here.
+  */
+ final class PayloadFilter extends TokenFilter {
+   String fieldName;
+   int numSeen = 0;
+   Set<String> entities = new HashSet<String>();
+   Set<String> nopayload = new HashSet<String>();
+   // Running position: sum of the position increments of all tokens seen so far.
+   int pos;
+   PayloadAttribute payloadAtt;
+   CharTermAttribute termAtt;
+   PositionIncrementAttribute posIncrAtt;
+
+   public PayloadFilter(TokenStream input, String fieldName) {
+     super(input);
+     this.fieldName = fieldName;
+     this.pos = 0;
+
+     // Terms labelled "Entity"; every other payload-carrying term is "Noise".
+     entities.add("xx");
+     entities.add("one");
+
+     // Terms that never receive a payload from this filter.
+     nopayload.add("nopayload");
+     nopayload.add("np");
+
+     termAtt = addAttribute(CharTermAttribute.class);
+     posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+     payloadAtt = addAttribute(PayloadAttribute.class);
+   }
+
+   @Override
+   public boolean incrementToken() throws IOException {
+     if (!input.incrementToken()) {
+       return false;
+     }
+
+     final String term = termAtt.toString();
+     if (!nopayload.contains(term)) {
+       final String label = entities.contains(term) ? ":Entity:" : ":Noise:";
+       payloadAtt.setPayload(new Payload((term + label + pos).getBytes()));
+     }
+     // The payload records the position BEFORE this token's increment is applied.
+     pos += posIncrAtt.getPositionIncrement();
+     return true;
+   }
+ }
+
+ /**
+  * Analyzer used by the shrink-to-shortest-match tests: feeds a LowerCaseTokenizer
+  * through a PayloadFilter.
+  */
+ public final class TestPayloadAnalyzer extends Analyzer {
+
+   @Override
+   public TokenStream tokenStream(String fieldName, Reader reader) {
+     final TokenStream tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, reader);
+     return new PayloadFilter(tokenizer, fieldName);
+   }
+ }
+}