+++ /dev/null
-package org.apache.lucene.search.spans;
-
-/**
- * Copyright 2004 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.LowerCaseTokenizer;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.index.CorruptIndexException;
-import org.apache.lucene.index.RandomIndexWriter;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Payload;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.search.DefaultSimilarity;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Similarity;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.search.payloads.PayloadHelper;
-import org.apache.lucene.search.payloads.PayloadSpanUtil;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.LockObtainFailedException;
-import org.apache.lucene.util.LuceneTestCase;
-
-public class TestPayloadSpans extends LuceneTestCase {
- private IndexSearcher searcher;
- private Similarity similarity = new DefaultSimilarity();
- protected IndexReader indexReader;
- private IndexReader closeIndexReader;
- private Directory directory;
-
- /**
-  * Builds the shared searcher through {@link PayloadHelper}, handing it the test's random
-  * source, the similarity and the value 1000, and caches the searcher's index reader.
-  */
- @Override
- public void setUp() throws Exception {
-   super.setUp();
-   searcher = new PayloadHelper().setUp(random, similarity, 1000);
-   indexReader = searcher.getIndexReader();
- }
-
- /**
-  * Runs a span term query for "seventy" against the payload field and against the
-  * no-payload field, checking the span count and the payload shape of each result.
-  */
- public void testSpanTermQuery() throws Exception {
-   // Payload field: 100 spans, each with one payload of length 1 whose first byte is 1.
-   SpanTermQuery payloadQuery = new SpanTermQuery(new Term(PayloadHelper.FIELD, "seventy"));
-   Spans payloadSpans = payloadQuery.getSpans(indexReader);
-   assertTrue("spans is null and it shouldn't be", payloadSpans != null);
-   checkSpans(payloadSpans, 100, 1, 1, 1);
-
-   // No-payload field: 100 spans again, none of which may report an available payload.
-   SpanTermQuery plainQuery = new SpanTermQuery(new Term(PayloadHelper.NO_PAYLOAD_FIELD, "seventy"));
-   Spans plainSpans = plainQuery.getSpans(indexReader);
-   assertTrue("spans is null and it shouldn't be", plainSpans != null);
-   checkSpans(plainSpans, 100, 0, 0, 0);
- }
-
- /**
-  * Checks payloads reported through {@link SpanFirstQuery} (end position 2), first around a
-  * single term and then around ordered and unordered near queries.
-  */
- public void testSpanFirst() throws IOException {
-   // Plain term as the wrapped clause.
-   SpanQuery oneTerm = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
-   Spans firstSpans = new SpanFirstQuery(oneTerm, 2).getSpans(indexReader);
-   checkSpans(firstSpans, 109, 1, 1, 1);
-
-   //Test more complicated subclause
-   SpanQuery[] clauses = {
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "one")),
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "hundred"))
-   };
-
-   // In-order near query with no slop: two payloads per span.
-   SpanQuery orderedNear = new SpanNearQuery(clauses, 0, true);
-   checkSpans(new SpanFirstQuery(orderedNear, 2).getSpans(indexReader), 100, 2, 1, 1);
-
-   // Same clauses, order not required.
-   SpanQuery unorderedNear = new SpanNearQuery(clauses, 0, false);
-   checkSpans(new SpanFirstQuery(unorderedNear, 2).getSpans(indexReader), 100, 2, 1, 1);
- }
-
- /**
-  * Indexes one document and checks that a {@link SpanNotQuery} ("one" near "three", excluding
-  * "two") yields exactly one span carrying two payloads.
-  */
- public void testSpanNot() throws Exception {
-   SpanQuery[] nearClauses = {
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "one")),
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "three"))
-   };
-   SpanQuery include = new SpanNearQuery(nearClauses, 5, true);
-   SpanQuery exclude = new SpanTermQuery(new Term(PayloadHelper.FIELD, "two"));
-   SpanNotQuery snq = new SpanNotQuery(include, exclude);
-
-   // Local directory for this test only; it is closed at the end of the method.
-   Directory dir = newDirectory();
-   RandomIndexWriter writer = new RandomIndexWriter(random, dir,
-       newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity));
-
-   Document doc = new Document();
-   doc.add(newField(PayloadHelper.FIELD, "one two three one four three",
-       Field.Store.YES, Field.Index.ANALYZED));
-   writer.addDocument(doc);
-   IndexReader reader = writer.getReader();
-   writer.close();
-
-   checkSpans(snq.getSpans(reader), 1, new int[]{2});
-   reader.close();
-   dir.close();
- }
-
- /**
-  * Checks span and payload counts for flat and nested {@link SpanNearQuery} combinations of
-  * "xx", "rr" and "yy" over the documents indexed by {@code getSearcher()}.
-  */
- public void testNestedSpans() throws Exception {
-   IndexSearcher searcher = getSearcher();
-   IndexReader reader = searcher.getIndexReader();
-
-   // "mark" is not part of any document built by getSearcher(), so zero spans are expected.
-   Spans spans = new SpanTermQuery(new Term(PayloadHelper.FIELD, "mark")).getSpans(reader);
-   assertTrue("spans is null and it shouldn't be", spans != null);
-   checkSpans(spans, 0, null);
-
-   // rr, yy, xx in any order, slop 12.
-   SpanQuery[] anyOrder = {
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr")),
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy")),
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx"))
-   };
-   spans = new SpanNearQuery(anyOrder, 12, false).getSpans(reader);
-   assertTrue("spans is null and it shouldn't be", spans != null);
-   checkSpans(spans, 2, new int[]{3,3});
-
-   // xx, rr, yy in that order, slop 6.
-   SpanQuery[] inOrder = {
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx")),
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr")),
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy"))
-   };
-   spans = new SpanNearQuery(inOrder, 6, true).getSpans(reader);
-   assertTrue("spans is null and it shouldn't be", spans != null);
-   checkSpans(spans, 1, new int[]{3});
-
-   // xx within 6 of rr
-   SpanQuery[] xxThenRr = {
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx")),
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr"))
-   };
-   SpanNearQuery xxNearRr = new SpanNearQuery(xxThenRr, 6, true);
-
-   // yy within 6 of xx within 6 of rr
-   SpanQuery[] outerClauses = {
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy")),
-     xxNearRr
-   };
-   SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(outerClauses, 6, false);
-
-   spans = nestedSpanNearQuery.getSpans(reader);
-   assertTrue("spans is null and it shouldn't be", spans != null);
-   checkSpans(spans, 2, new int[]{3,3});
-
-   // Release everything getSearcher() opened.
-   searcher.close();
-   closeIndexReader.close();
-   directory.close();
- }
-
- /**
-  * Checks payload collection through three levels of nested near queries whose outer clauses
-  * ("np") and innermost first clause ("nopayload") are terms the test analyzer gives no payload.
-  */
- public void testFirstClauseWithoutPayload() throws Exception {
-   IndexSearcher searcher = getSearcher();
-
-   // Innermost: nopayload, qq, ss in order, slop 6.
-   SpanQuery[] innerClauses = {
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "nopayload")),
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "qq")),
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "ss"))
-   };
-   SpanNearQuery inner = new SpanNearQuery(innerClauses, 6, true);
-
-   // Middle: pp together with the inner match, any order, slop 6.
-   SpanQuery[] middleClauses = {
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "pp")),
-     inner
-   };
-   SpanNearQuery middle = new SpanNearQuery(middleClauses, 6, false);
-
-   // Outer: np together with the middle match, any order, slop 6.
-   SpanQuery[] outerClauses = {
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "np")),
-     middle
-   };
-   SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(outerClauses, 6, false);
-
-   Spans spans = nestedSpanNearQuery.getSpans(searcher.getIndexReader());
-   assertTrue("spans is null and it shouldn't be", spans != null);
-   checkSpans(spans, 1, new int[]{3});
-
-   // Release everything getSearcher() opened.
-   searcher.close();
-   closeIndexReader.close();
-   directory.close();
- }
-
- /**
-  * Checks that a near query nesting two further near queries (one of them nested again)
-  * reports two spans with eight payloads each.
-  */
- public void testHeavilyNestedSpanQuery() throws Exception {
-   IndexSearcher searcher = getSearcher();
-
-   // one, two, three in order, slop 5.
-   SpanQuery[] numberClauses = {
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "one")),
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "two")),
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "three"))
-   };
-   SpanNearQuery oneTwoThree = new SpanNearQuery(numberClauses, 5, true);
-
-   // (one two three), five, six in order, slop 6.
-   SpanQuery[] extendedClauses = {
-     oneTwoThree,
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "five")),
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "six"))
-   };
-   SpanNearQuery throughSix = new SpanNearQuery(extendedClauses, 6, true);
-
-   // eleven and ten in any order, slop 2.
-   SpanQuery[] tailClauses = {
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "eleven")),
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "ten"))
-   };
-   SpanNearQuery elevenTen = new SpanNearQuery(tailClauses, 2, false);
-
-   // nine plus both nested queries, any order, slop 6.
-   SpanQuery[] outerClauses = {
-     new SpanTermQuery(new Term(PayloadHelper.FIELD, "nine")),
-     throughSix,
-     elevenTen
-   };
-   SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(outerClauses, 6, false);
-
-   Spans spans = nestedSpanNearQuery.getSpans(searcher.getIndexReader());
-   assertTrue("spans is null and it shouldn't be", spans != null);
-   checkSpans(spans, 2, new int[]{8, 8});
-
-   // Release everything getSearcher() opened.
-   searcher.close();
-   closeIndexReader.close();
-   directory.close();
- }
-
- /**
-  * Indexes "a b c d e f g h i j a k" and checks that an in-order near query for "a" then "k"
-  * (slop 1) collects exactly the payloads "a:Noise:10" and "k:Noise:11".
-  */
- public void testShrinkToAfterShortestMatch() throws CorruptIndexException,
-     LockObtainFailedException, IOException {
-   Directory dir = newDirectory();
-   RandomIndexWriter writer = new RandomIndexWriter(random, dir,
-       newIndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer()));
-
-   Document doc = new Document();
-   doc.add(new Field("content", new StringReader("a b c d e f g h i j a k")));
-   writer.addDocument(doc);
-
-   IndexReader reader = writer.getReader();
-   IndexSearcher is = newSearcher(reader);
-   writer.close();
-
-   SpanQuery[] sqs = {
-     new SpanTermQuery(new Term("content", "a")),
-     new SpanTermQuery(new Term("content", "k"))
-   };
-   SpanNearQuery snq = new SpanNearQuery(sqs, 1, true);
-   Spans spans = snq.getSpans(is.getIndexReader());
-
-   TopDocs topDocs = is.search(snq, 1);
-   Set<String> payloadSet = new HashSet<String>();
-   // One pass per hit; the single spans enumeration is drained during the first pass.
-   for (int hitsLeft = topDocs.scoreDocs.length; hitsLeft > 0; hitsLeft--) {
-     while (spans.next()) {
-       for (final byte[] payload : spans.getPayload()) {
-         payloadSet.add(new String(payload));
-       }
-     }
-   }
-
-   assertEquals(2, payloadSet.size());
-   assertTrue(payloadSet.contains("a:Noise:10"));
-   assertTrue(payloadSet.contains("k:Noise:11"));
-
-   is.close();
-   reader.close();
-   dir.close();
- }
-
- /**
-  * Indexes "a b a d k f a h i k a k" and checks that an in-order near query for "a" then "k"
-  * with no slop collects exactly the payloads "a:Noise:10" and "k:Noise:11".
-  */
- public void testShrinkToAfterShortestMatch2() throws CorruptIndexException,
-     LockObtainFailedException, IOException {
-   Directory dir = newDirectory();
-   RandomIndexWriter writer = new RandomIndexWriter(random, dir,
-       newIndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer()));
-
-   Document doc = new Document();
-   doc.add(new Field("content", new StringReader("a b a d k f a h i k a k")));
-   writer.addDocument(doc);
-   IndexReader reader = writer.getReader();
-   IndexSearcher is = newSearcher(reader);
-   writer.close();
-
-   SpanQuery[] sqs = {
-     new SpanTermQuery(new Term("content", "a")),
-     new SpanTermQuery(new Term("content", "k"))
-   };
-   SpanNearQuery snq = new SpanNearQuery(sqs, 0, true);
-   Spans spans = snq.getSpans(is.getIndexReader());
-
-   TopDocs topDocs = is.search(snq, 1);
-   Set<String> payloadSet = new HashSet<String>();
-   // One pass per hit; the single spans enumeration is drained during the first pass.
-   for (int hitsLeft = topDocs.scoreDocs.length; hitsLeft > 0; hitsLeft--) {
-     while (spans.next()) {
-       for (final byte[] payload : spans.getPayload()) {
-         payloadSet.add(new String(payload));
-       }
-     }
-   }
-
-   assertEquals(2, payloadSet.size());
-   assertTrue(payloadSet.contains("a:Noise:10"));
-   assertTrue(payloadSet.contains("k:Noise:11"));
-
-   is.close();
-   reader.close();
-   dir.close();
- }
-
- /**
-  * Indexes "j k a l f k k p a t a k l k t a" and checks that an in-order near query for "a"
-  * then "k" with no slop collects exactly the payloads "a:Noise:10" and "k:Noise:11".
-  */
- public void testShrinkToAfterShortestMatch3() throws CorruptIndexException,
-     LockObtainFailedException, IOException {
-   Directory dir = newDirectory();
-   RandomIndexWriter writer = new RandomIndexWriter(random, dir,
-       newIndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer()));
-
-   Document doc = new Document();
-   doc.add(new Field("content", new StringReader("j k a l f k k p a t a k l k t a")));
-   writer.addDocument(doc);
-   IndexReader reader = writer.getReader();
-   IndexSearcher is = newSearcher(reader);
-   writer.close();
-
-   SpanQuery[] sqs = {
-     new SpanTermQuery(new Term("content", "a")),
-     new SpanTermQuery(new Term("content", "k"))
-   };
-   SpanNearQuery snq = new SpanNearQuery(sqs, 0, true);
-   Spans spans = snq.getSpans(is.getIndexReader());
-
-   TopDocs topDocs = is.search(snq, 1);
-   Set<String> payloadSet = new HashSet<String>();
-   // One pass per hit; the single spans enumeration is drained during the first pass.
-   for (int hitsLeft = topDocs.scoreDocs.length; hitsLeft > 0; hitsLeft--) {
-     while (spans.next()) {
-       for (final byte[] payload : spans.getPayload()) {
-         payloadSet.add(new String(payload));
-       }
-     }
-   }
-
-   assertEquals(2, payloadSet.size());
-   if (VERBOSE) {
-     // Dump what was collected before the content assertions run.
-     for (final String collected : payloadSet) {
-       System.out.println("match:" + collected);
-     }
-   }
-   assertTrue(payloadSet.contains("a:Noise:10"));
-   assertTrue(payloadSet.contains("k:Noise:11"));
-
-   is.close();
-   reader.close();
-   dir.close();
- }
-
- /**
-  * Exercises {@link PayloadSpanUtil#getPayloadsForQuery} with a term query for "rr" on a
-  * one-document index. The result is only printed (when VERBOSE); nothing is asserted on it.
-  */
- public void testPayloadSpanUtil() throws Exception {
-   Directory dir = newDirectory();
-   RandomIndexWriter writer = new RandomIndexWriter(random, dir,
-       newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity));
-
-   Document doc = new Document();
-   doc.add(newField(PayloadHelper.FIELD,"xx rr yy mm pp", Field.Store.YES, Field.Index.ANALYZED));
-   writer.addDocument(doc);
-
-   IndexReader reader = writer.getReader();
-   writer.close();
-   IndexSearcher searcher = newSearcher(reader);
-
-   PayloadSpanUtil psu = new PayloadSpanUtil(searcher.getIndexReader());
-   TermQuery rrQuery = new TermQuery(new Term(PayloadHelper.FIELD, "rr"));
-   Collection<byte[]> payloads = psu.getPayloadsForQuery(rrQuery);
-
-   if (VERBOSE) {
-     System.out.println("Num payloads:" + payloads.size());
-     for (final byte[] bytes : payloads) {
-       System.out.println(new String(bytes));
-     }
-   }
-
-   searcher.close();
-   reader.close();
-   dir.close();
- }
-
- private void checkSpans(Spans spans, int expectedNumSpans, int expectedNumPayloads,
- int expectedPayloadLength, int expectedFirstByte) throws IOException {
- assertTrue("spans is null and it shouldn't be", spans != null);
- //each position match should have a span associated with it, since there is just one underlying term query, there should
- //only be one entry in the span
- int seen = 0;
- while (spans.next() == true)
- {
- //if we expect payloads, then isPayloadAvailable should be true
- if (expectedNumPayloads > 0) {
- assertTrue("isPayloadAvailable is not returning the correct value: " + spans.isPayloadAvailable()
- + " and it should be: " + (expectedNumPayloads > 0),
- spans.isPayloadAvailable() == true);
- } else {
- assertTrue("isPayloadAvailable should be false", spans.isPayloadAvailable() == false);
- }
- //See payload helper, for the PayloadHelper.FIELD field, there is a single byte payload at every token
- if (spans.isPayloadAvailable()) {
- Collection<byte[]> payload = spans.getPayload();
- assertTrue("payload Size: " + payload.size() + " is not: " + expectedNumPayloads, payload.size() == expectedNumPayloads);
- for (final byte [] thePayload : payload) {
- assertTrue("payload[0] Size: " + thePayload.length + " is not: " + expectedPayloadLength,
- thePayload.length == expectedPayloadLength);
- assertTrue(thePayload[0] + " does not equal: " + expectedFirstByte, thePayload[0] == expectedFirstByte);
-
- }
-
- }
- seen++;
- }
- assertTrue(seen + " does not equal: " + expectedNumSpans, seen == expectedNumSpans);
- }
-
- /**
-  * Indexes five small documents with {@link PayloadAnalyzer} and returns a searcher over them.
-  * The directory and reader are stored in the {@code directory} and {@code closeIndexReader}
-  * fields; this method does not close them.
-  *
-  * @return a new searcher over the freshly built index
-  */
- private IndexSearcher getSearcher() throws Exception {
-   directory = newDirectory();
-   String[] docs = new String[]{"xx rr yy mm pp","xx yy mm rr pp", "nopayload qq ss pp np", "one two three four five six seven eight nine ten eleven", "nine one two three four five six seven eight eleven ten"};
-   RandomIndexWriter writer = new RandomIndexWriter(random, directory,
-       newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity));
-
-   // One document per text, all in the payload field.
-   for (String docText : docs) {
-     Document doc = new Document();
-     doc.add(newField(PayloadHelper.FIELD,docText, Field.Store.YES, Field.Index.ANALYZED));
-     writer.addDocument(doc);
-   }
-
-   closeIndexReader = writer.getReader();
-   writer.close();
-
-   return newSearcher(closeIndexReader);
- }
-
- private void checkSpans(Spans spans, int numSpans, int[] numPayloads) throws IOException {
- int cnt = 0;
-
- while (spans.next() == true) {
- if(VERBOSE)
- System.out.println("\nSpans Dump --");
- if (spans.isPayloadAvailable()) {
- Collection<byte[]> payload = spans.getPayload();
- if(VERBOSE)
- System.out.println("payloads for span:" + payload.size());
- for (final byte [] bytes : payload) {
- if(VERBOSE)
- System.out.println("doc:" + spans.doc() + " s:" + spans.start() + " e:" + spans.end() + " "
- + new String(bytes));
- }
-
- assertEquals(numPayloads[cnt],payload.size());
- } else {
- assertFalse("Expected spans:" + numPayloads[cnt] + " found: 0",numPayloads.length > 0 && numPayloads[cnt] > 0 );
- }
- cnt++;
- }
-
- assertEquals(numSpans, cnt);
- }
-
- /** Analyzer that lower-case tokenizes the input and wraps the result in a {@link PayloadFilter}. */
- final class PayloadAnalyzer extends Analyzer {
-
-   @Override
-   public TokenStream tokenStream(String fieldName, Reader reader) {
-     TokenStream tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, reader);
-     return new PayloadFilter(tokenizer, fieldName);
-   }
- }
-
- /**
-  * Token filter that attaches a payload of the form {@code <token>:Entity:<pos>} or
-  * {@code <token>:Noise:<pos>} to each token, except for tokens listed in {@code nopayload},
-  * for which no payload is set. {@code pos} is the sum of the position increments of the
-  * tokens seen before the current one.
-  */
- final class PayloadFilter extends TokenFilter {
-   String fieldName;
-   int numSeen = 0;
-   // Tokens labelled "Entity"; every other payload-carrying token is labelled "Noise".
-   Set<String> entities = new HashSet<String>();
-   // Tokens for which this filter sets no payload.
-   Set<String> nopayload = new HashSet<String>();
-   // Running total of position increments; read before it is advanced for the current token.
-   int pos;
-   PayloadAttribute payloadAtt;
-   CharTermAttribute termAtt;
-   PositionIncrementAttribute posIncrAtt;
-
-   public PayloadFilter(TokenStream input, String fieldName) {
-     super(input);
-     this.fieldName = fieldName;
-     this.pos = 0;
-
-     entities.add("xx");
-     entities.add("one");
-
-     nopayload.add("nopayload");
-     nopayload.add("np");
-
-     termAtt = addAttribute(CharTermAttribute.class);
-     posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-     payloadAtt = addAttribute(PayloadAttribute.class);
-   }
-
-   @Override
-   public boolean incrementToken() throws IOException {
-     if (!input.incrementToken()) {
-       return false;
-     }
-
-     final String token = termAtt.toString();
-     if (!nopayload.contains(token)) {
-       final String label = entities.contains(token) ? ":Entity:" : ":Noise:";
-       payloadAtt.setPayload(new Payload((token + label + pos).getBytes()));
-     }
-
-     pos += posIncrAtt.getPositionIncrement();
-     return true;
-   }
- }
-
- /** Analyzer that lower-case tokenizes the input and wraps the result in a {@link PayloadFilter}. */
- public final class TestPayloadAnalyzer extends Analyzer {
-
-   @Override
-   public TokenStream tokenStream(String fieldName, Reader reader) {
-     TokenStream tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, reader);
-     return new PayloadFilter(tokenizer, fieldName);
-   }
- }
-}