1 package org.apache.lucene.search.spans;
/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.payloads.PayloadHelper;
import org.apache.lucene.search.payloads.PayloadSpanUtil;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.LuceneTestCase;
51 public class TestPayloadSpans extends LuceneTestCase {
52 private IndexSearcher searcher;
53 private Similarity similarity = new DefaultSimilarity();
54 protected IndexReader indexReader;
55 private IndexReader closeIndexReader;
56 private Directory directory;
59 public void setUp() throws Exception {
61 PayloadHelper helper = new PayloadHelper();
62 searcher = helper.setUp(random, similarity, 1000);
63 indexReader = searcher.getIndexReader();
66 public void testSpanTermQuery() throws Exception {
69 stq = new SpanTermQuery(new Term(PayloadHelper.FIELD, "seventy"));
70 spans = stq.getSpans(indexReader);
71 assertTrue("spans is null and it shouldn't be", spans != null);
72 checkSpans(spans, 100, 1, 1, 1);
74 stq = new SpanTermQuery(new Term(PayloadHelper.NO_PAYLOAD_FIELD, "seventy"));
75 spans = stq.getSpans(indexReader);
76 assertTrue("spans is null and it shouldn't be", spans != null);
77 checkSpans(spans, 100, 0, 0, 0);
80 public void testSpanFirst() throws IOException {
84 match = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
85 sfq = new SpanFirstQuery(match, 2);
86 Spans spans = sfq.getSpans(indexReader);
87 checkSpans(spans, 109, 1, 1, 1);
88 //Test more complicated subclause
89 SpanQuery[] clauses = new SpanQuery[2];
90 clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
91 clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "hundred"));
92 match = new SpanNearQuery(clauses, 0, true);
93 sfq = new SpanFirstQuery(match, 2);
94 checkSpans(sfq.getSpans(indexReader), 100, 2, 1, 1);
96 match = new SpanNearQuery(clauses, 0, false);
97 sfq = new SpanFirstQuery(match, 2);
98 checkSpans(sfq.getSpans(indexReader), 100, 2, 1, 1);
102 public void testSpanNot() throws Exception {
103 SpanQuery[] clauses = new SpanQuery[2];
104 clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
105 clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "three"));
106 SpanQuery spq = new SpanNearQuery(clauses, 5, true);
107 SpanNotQuery snq = new SpanNotQuery(spq, new SpanTermQuery(new Term(PayloadHelper.FIELD, "two")));
111 Directory directory = newDirectory();
112 RandomIndexWriter writer = new RandomIndexWriter(random, directory,
113 newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity));
115 Document doc = new Document();
116 doc.add(newField(PayloadHelper.FIELD, "one two three one four three",
117 Field.Store.YES, Field.Index.ANALYZED));
118 writer.addDocument(doc);
119 IndexReader reader = writer.getReader();
122 checkSpans(snq.getSpans(reader), 1,new int[]{2});
127 public void testNestedSpans() throws Exception {
130 IndexSearcher searcher = getSearcher();
131 stq = new SpanTermQuery(new Term(PayloadHelper.FIELD, "mark"));
132 spans = stq.getSpans(searcher.getIndexReader());
133 assertTrue("spans is null and it shouldn't be", spans != null);
134 checkSpans(spans, 0, null);
137 SpanQuery[] clauses = new SpanQuery[3];
138 clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr"));
139 clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy"));
140 clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx"));
141 SpanNearQuery spanNearQuery = new SpanNearQuery(clauses, 12, false);
143 spans = spanNearQuery.getSpans(searcher.getIndexReader());
144 assertTrue("spans is null and it shouldn't be", spans != null);
145 checkSpans(spans, 2, new int[]{3,3});
148 clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx"));
149 clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr"));
150 clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy"));
152 spanNearQuery = new SpanNearQuery(clauses, 6, true);
155 spans = spanNearQuery.getSpans(searcher.getIndexReader());
156 assertTrue("spans is null and it shouldn't be", spans != null);
157 checkSpans(spans, 1, new int[]{3});
159 clauses = new SpanQuery[2];
161 clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx"));
162 clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr"));
164 spanNearQuery = new SpanNearQuery(clauses, 6, true);
168 SpanQuery[] clauses2 = new SpanQuery[2];
170 clauses2[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy"));
171 clauses2[1] = spanNearQuery;
173 SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(clauses2, 6, false);
175 // yy within 6 of xx within 6 of rr
177 spans = nestedSpanNearQuery.getSpans(searcher.getIndexReader());
178 assertTrue("spans is null and it shouldn't be", spans != null);
179 checkSpans(spans, 2, new int[]{3,3});
181 closeIndexReader.close();
185 public void testFirstClauseWithoutPayload() throws Exception {
187 IndexSearcher searcher = getSearcher();
189 SpanQuery[] clauses = new SpanQuery[3];
190 clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "nopayload"));
191 clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "qq"));
192 clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "ss"));
194 SpanNearQuery spanNearQuery = new SpanNearQuery(clauses, 6, true);
196 SpanQuery[] clauses2 = new SpanQuery[2];
198 clauses2[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "pp"));
199 clauses2[1] = spanNearQuery;
201 SpanNearQuery snq = new SpanNearQuery(clauses2, 6, false);
203 SpanQuery[] clauses3 = new SpanQuery[2];
205 clauses3[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "np"));
208 SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(clauses3, 6, false);
210 spans = nestedSpanNearQuery.getSpans(searcher.getIndexReader());
211 assertTrue("spans is null and it shouldn't be", spans != null);
212 checkSpans(spans, 1, new int[]{3});
214 closeIndexReader.close();
218 public void testHeavilyNestedSpanQuery() throws Exception {
220 IndexSearcher searcher = getSearcher();
222 SpanQuery[] clauses = new SpanQuery[3];
223 clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
224 clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "two"));
225 clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "three"));
227 SpanNearQuery spanNearQuery = new SpanNearQuery(clauses, 5, true);
229 clauses = new SpanQuery[3];
230 clauses[0] = spanNearQuery;
231 clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "five"));
232 clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "six"));
234 SpanNearQuery spanNearQuery2 = new SpanNearQuery(clauses, 6, true);
236 SpanQuery[] clauses2 = new SpanQuery[2];
237 clauses2[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "eleven"));
238 clauses2[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "ten"));
239 SpanNearQuery spanNearQuery3 = new SpanNearQuery(clauses2, 2, false);
241 SpanQuery[] clauses3 = new SpanQuery[3];
242 clauses3[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "nine"));
243 clauses3[1] = spanNearQuery2;
244 clauses3[2] = spanNearQuery3;
246 SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(clauses3, 6, false);
248 spans = nestedSpanNearQuery.getSpans(searcher.getIndexReader());
249 assertTrue("spans is null and it shouldn't be", spans != null);
250 checkSpans(spans, 2, new int[]{8, 8});
252 closeIndexReader.close();
256 public void testShrinkToAfterShortestMatch() throws CorruptIndexException,
257 LockObtainFailedException, IOException {
258 Directory directory = newDirectory();
259 RandomIndexWriter writer = new RandomIndexWriter(random, directory,
260 newIndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer()));
262 Document doc = new Document();
263 doc.add(new Field("content", new StringReader("a b c d e f g h i j a k")));
264 writer.addDocument(doc);
266 IndexReader reader = writer.getReader();
267 IndexSearcher is = newSearcher(reader);
270 SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
271 SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
272 SpanQuery[] sqs = { stq1, stq2 };
273 SpanNearQuery snq = new SpanNearQuery(sqs, 1, true);
274 Spans spans = snq.getSpans(is.getIndexReader());
276 TopDocs topDocs = is.search(snq, 1);
277 Set<String> payloadSet = new HashSet<String>();
278 for (int i = 0; i < topDocs.scoreDocs.length; i++) {
279 while (spans.next()) {
280 Collection<byte[]> payloads = spans.getPayload();
282 for (final byte [] payload : payloads) {
283 payloadSet.add(new String(payload));
287 assertEquals(2, payloadSet.size());
288 assertTrue(payloadSet.contains("a:Noise:10"));
289 assertTrue(payloadSet.contains("k:Noise:11"));
295 public void testShrinkToAfterShortestMatch2() throws CorruptIndexException,
296 LockObtainFailedException, IOException {
297 Directory directory = newDirectory();
298 RandomIndexWriter writer = new RandomIndexWriter(random, directory,
299 newIndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer()));
301 Document doc = new Document();
302 doc.add(new Field("content", new StringReader("a b a d k f a h i k a k")));
303 writer.addDocument(doc);
304 IndexReader reader = writer.getReader();
305 IndexSearcher is = newSearcher(reader);
308 SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
309 SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
310 SpanQuery[] sqs = { stq1, stq2 };
311 SpanNearQuery snq = new SpanNearQuery(sqs, 0, true);
312 Spans spans = snq.getSpans(is.getIndexReader());
314 TopDocs topDocs = is.search(snq, 1);
315 Set<String> payloadSet = new HashSet<String>();
316 for (int i = 0; i < topDocs.scoreDocs.length; i++) {
317 while (spans.next()) {
318 Collection<byte[]> payloads = spans.getPayload();
319 for (final byte[] payload : payloads) {
320 payloadSet.add(new String(payload));
324 assertEquals(2, payloadSet.size());
325 assertTrue(payloadSet.contains("a:Noise:10"));
326 assertTrue(payloadSet.contains("k:Noise:11"));
332 public void testShrinkToAfterShortestMatch3() throws CorruptIndexException,
333 LockObtainFailedException, IOException {
334 Directory directory = newDirectory();
335 RandomIndexWriter writer = new RandomIndexWriter(random, directory,
336 newIndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer()));
338 Document doc = new Document();
339 doc.add(new Field("content", new StringReader("j k a l f k k p a t a k l k t a")));
340 writer.addDocument(doc);
341 IndexReader reader = writer.getReader();
342 IndexSearcher is = newSearcher(reader);
345 SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
346 SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
347 SpanQuery[] sqs = { stq1, stq2 };
348 SpanNearQuery snq = new SpanNearQuery(sqs, 0, true);
349 Spans spans = snq.getSpans(is.getIndexReader());
351 TopDocs topDocs = is.search(snq, 1);
352 Set<String> payloadSet = new HashSet<String>();
353 for (int i = 0; i < topDocs.scoreDocs.length; i++) {
354 while (spans.next()) {
355 Collection<byte[]> payloads = spans.getPayload();
357 for (final byte [] payload : payloads) {
358 payloadSet.add(new String(payload));
362 assertEquals(2, payloadSet.size());
364 for (final String payload : payloadSet)
365 System.out.println("match:" + payload);
368 assertTrue(payloadSet.contains("a:Noise:10"));
369 assertTrue(payloadSet.contains("k:Noise:11"));
375 public void testPayloadSpanUtil() throws Exception {
376 Directory directory = newDirectory();
377 RandomIndexWriter writer = new RandomIndexWriter(random, directory,
378 newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity));
380 Document doc = new Document();
381 doc.add(newField(PayloadHelper.FIELD,"xx rr yy mm pp", Field.Store.YES, Field.Index.ANALYZED));
382 writer.addDocument(doc);
384 IndexReader reader = writer.getReader();
386 IndexSearcher searcher = newSearcher(reader);
388 PayloadSpanUtil psu = new PayloadSpanUtil(searcher.getIndexReader());
390 Collection<byte[]> payloads = psu.getPayloadsForQuery(new TermQuery(new Term(PayloadHelper.FIELD, "rr")));
392 System.out.println("Num payloads:" + payloads.size());
393 for (final byte [] bytes : payloads) {
395 System.out.println(new String(bytes));
402 private void checkSpans(Spans spans, int expectedNumSpans, int expectedNumPayloads,
403 int expectedPayloadLength, int expectedFirstByte) throws IOException {
404 assertTrue("spans is null and it shouldn't be", spans != null);
405 //each position match should have a span associated with it, since there is just one underlying term query, there should
406 //only be one entry in the span
408 while (spans.next() == true)
410 //if we expect payloads, then isPayloadAvailable should be true
411 if (expectedNumPayloads > 0) {
412 assertTrue("isPayloadAvailable is not returning the correct value: " + spans.isPayloadAvailable()
413 + " and it should be: " + (expectedNumPayloads > 0),
414 spans.isPayloadAvailable() == true);
416 assertTrue("isPayloadAvailable should be false", spans.isPayloadAvailable() == false);
418 //See payload helper, for the PayloadHelper.FIELD field, there is a single byte payload at every token
419 if (spans.isPayloadAvailable()) {
420 Collection<byte[]> payload = spans.getPayload();
421 assertTrue("payload Size: " + payload.size() + " is not: " + expectedNumPayloads, payload.size() == expectedNumPayloads);
422 for (final byte [] thePayload : payload) {
423 assertTrue("payload[0] Size: " + thePayload.length + " is not: " + expectedPayloadLength,
424 thePayload.length == expectedPayloadLength);
425 assertTrue(thePayload[0] + " does not equal: " + expectedFirstByte, thePayload[0] == expectedFirstByte);
432 assertTrue(seen + " does not equal: " + expectedNumSpans, seen == expectedNumSpans);
435 private IndexSearcher getSearcher() throws Exception {
436 directory = newDirectory();
437 String[] docs = new String[]{"xx rr yy mm pp","xx yy mm rr pp", "nopayload qq ss pp np", "one two three four five six seven eight nine ten eleven", "nine one two three four five six seven eight eleven ten"};
438 RandomIndexWriter writer = new RandomIndexWriter(random, directory,
439 newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity));
442 for(int i = 0; i < docs.length; i++) {
443 doc = new Document();
444 String docText = docs[i];
445 doc.add(newField(PayloadHelper.FIELD,docText, Field.Store.YES, Field.Index.ANALYZED));
446 writer.addDocument(doc);
449 closeIndexReader = writer.getReader();
452 IndexSearcher searcher = newSearcher(closeIndexReader);
456 private void checkSpans(Spans spans, int numSpans, int[] numPayloads) throws IOException {
459 while (spans.next() == true) {
461 System.out.println("\nSpans Dump --");
462 if (spans.isPayloadAvailable()) {
463 Collection<byte[]> payload = spans.getPayload();
465 System.out.println("payloads for span:" + payload.size());
466 for (final byte [] bytes : payload) {
468 System.out.println("doc:" + spans.doc() + " s:" + spans.start() + " e:" + spans.end() + " "
469 + new String(bytes));
472 assertEquals(numPayloads[cnt],payload.size());
474 assertFalse("Expected spans:" + numPayloads[cnt] + " found: 0",numPayloads.length > 0 && numPayloads[cnt] > 0 );
479 assertEquals(numSpans, cnt);
482 final class PayloadAnalyzer extends Analyzer {
485 public TokenStream tokenStream(String fieldName, Reader reader) {
486 TokenStream result = new LowerCaseTokenizer(TEST_VERSION_CURRENT, reader);
487 result = new PayloadFilter(result, fieldName);
492 final class PayloadFilter extends TokenFilter {
495 Set<String> entities = new HashSet<String>();
496 Set<String> nopayload = new HashSet<String>();
498 PayloadAttribute payloadAtt;
499 CharTermAttribute termAtt;
500 PositionIncrementAttribute posIncrAtt;
502 public PayloadFilter(TokenStream input, String fieldName) {
504 this.fieldName = fieldName;
508 nopayload.add("nopayload");
510 termAtt = addAttribute(CharTermAttribute.class);
511 posIncrAtt = addAttribute(PositionIncrementAttribute.class);
512 payloadAtt = addAttribute(PayloadAttribute.class);
516 public boolean incrementToken() throws IOException {
517 if (input.incrementToken()) {
518 String token = termAtt.toString();
520 if (!nopayload.contains(token)) {
521 if (entities.contains(token)) {
522 payloadAtt.setPayload(new Payload((token + ":Entity:"+ pos ).getBytes()));
524 payloadAtt.setPayload(new Payload((token + ":Noise:" + pos ).getBytes()));
527 pos += posIncrAtt.getPositionIncrement();
534 public final class TestPayloadAnalyzer extends Analyzer {
537 public TokenStream tokenStream(String fieldName, Reader reader) {
538 TokenStream result = new LowerCaseTokenizer(TEST_VERSION_CURRENT, reader);
539 result = new PayloadFilter(result, fieldName);