1 package org.apache.lucene.index;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
45 public class TestPayloads extends LuceneTestCase {
47 // Simple tests to test the Payload class
48 public void testPayload() throws Exception {
49 byte[] testData = "This is a test!".getBytes();
50 Payload payload = new Payload(testData);
51 assertEquals("Wrong payload length.", testData.length, payload.length());
54 byte[] target = new byte[testData.length - 1];
56 payload.copyTo(target, 0);
57 fail("Expected exception not thrown");
58 } catch (Exception expected) {
62 target = new byte[testData.length + 3];
63 payload.copyTo(target, 3);
65 for (int i = 0; i < testData.length; i++) {
66 assertEquals(testData[i], target[i + 3]);
71 target = payload.toByteArray();
72 assertByteArrayEquals(testData, target);
75 for (int i = 0; i < testData.length; i++) {
76 assertEquals(payload.byteAt(i), testData[i]);
80 payload.byteAt(testData.length + 1);
81 fail("Expected exception not thrown");
82 } catch (Exception expected) {
86 Payload clone = (Payload) payload.clone();
87 assertEquals(payload.length(), clone.length());
88 for (int i = 0; i < payload.length(); i++) {
89 assertEquals(payload.byteAt(i), clone.byteAt(i));
94 // Tests whether the DocumentWriter and SegmentMerger correctly enable the
95 // payload bit in the FieldInfo
96 public void testPayloadFieldBit() throws Exception {
97 Directory ram = newDirectory();
98 PayloadAnalyzer analyzer = new PayloadAnalyzer();
99 IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig( TEST_VERSION_CURRENT, analyzer));
100 Document d = new Document();
101 // this field won't have any payloads
102 d.add(newField("f1", "This field has no payloads", Field.Store.NO, Field.Index.ANALYZED));
103 // this field will have payloads in all docs, however not for all term positions,
104 // so this field is used to check if the DocumentWriter correctly enables the payloads bit
105 // even if only some term positions have payloads
106 d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
107 d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
108 // this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads
109 // enabled in only some documents
110 d.add(newField("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED));
111 // only add payload data for field f2
112 analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1);
113 writer.addDocument(d);
117 SegmentReader reader = SegmentReader.getOnlySegmentReader(ram);
118 FieldInfos fi = reader.fieldInfos();
119 assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
120 assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
121 assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads);
124 // now we add another document which has payloads for field f3 and verify if the SegmentMerger
125 // enabled payloads for that field
126 writer = new IndexWriter(ram, newIndexWriterConfig( TEST_VERSION_CURRENT,
127 analyzer).setOpenMode(OpenMode.CREATE));
129 d.add(newField("f1", "This field has no payloads", Field.Store.NO, Field.Index.ANALYZED));
130 d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
131 d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
132 d.add(newField("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED));
133 // add payload data for field f2 and f3
134 analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1);
135 analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3);
136 writer.addDocument(d);
142 reader = SegmentReader.getOnlySegmentReader(ram);
143 fi = reader.fieldInfos();
144 assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
145 assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
146 assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads);
151 // Tests if payloads are correctly stored and loaded using both RamDirectory and FSDirectory
152 public void testPayloadsEncoding() throws Exception {
153 // first perform the test using a RAMDirectory
154 Directory dir = newDirectory();
157 // now use a FSDirectory and repeat same test
158 File dirName = _TestUtil.getTempDir("test_payloads");
159 dir = newFSDirectory(dirName);
161 _TestUtil.rmDir(dirName);
165 // builds an index with payloads in the given Directory and performs
166 // different tests to verify the payload encoding
167 private void performTest(Directory dir) throws Exception {
168 PayloadAnalyzer analyzer = new PayloadAnalyzer();
169 IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
170 TEST_VERSION_CURRENT, analyzer)
171 .setOpenMode(OpenMode.CREATE)
172 .setMergePolicy(newLogMergePolicy()));
174 // should be in sync with value in TermInfosWriter
175 final int skipInterval = 16;
177 final int numTerms = 5;
178 final String fieldName = "f1";
180 int numDocs = skipInterval + 1;
181 // create content for the test documents with just a few terms
182 Term[] terms = generateTerms(fieldName, numTerms);
183 StringBuilder sb = new StringBuilder();
184 for (int i = 0; i < terms.length; i++) {
185 sb.append(terms[i].text);
188 String content = sb.toString();
191 int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
192 byte[] payloadData = generateRandomData(payloadDataLength);
194 Document d = new Document();
195 d.add(newField(fieldName, content, Field.Store.NO, Field.Index.ANALYZED));
196 // add the same document multiple times to have the same payload lengths for all
197 // occurrences within two consecutive skip intervals
199 for (int i = 0; i < 2 * numDocs; i++) {
200 analyzer.setPayloadData(fieldName, payloadData, offset, 1);
202 writer.addDocument(d);
205 // make sure we create more than one segment to test merging
208 // now we make sure to have different payload lengths next at the next skip point
209 for (int i = 0; i < numDocs; i++) {
210 analyzer.setPayloadData(fieldName, payloadData, offset, i);
211 offset += i * numTerms;
212 writer.addDocument(d);
222 * first we test if all payloads are stored correctly
224 IndexReader reader = IndexReader.open(dir, true);
226 byte[] verifyPayloadData = new byte[payloadDataLength];
228 TermPositions[] tps = new TermPositions[numTerms];
229 for (int i = 0; i < numTerms; i++) {
230 tps[i] = reader.termPositions(terms[i]);
233 while (tps[0].next()) {
234 for (int i = 1; i < numTerms; i++) {
237 int freq = tps[0].freq();
239 for (int i = 0; i < freq; i++) {
240 for (int j = 0; j < numTerms; j++) {
241 tps[j].nextPosition();
242 if (tps[j].isPayloadAvailable()) {
243 tps[j].getPayload(verifyPayloadData, offset);
244 offset += tps[j].getPayloadLength();
250 for (int i = 0; i < numTerms; i++) {
254 assertByteArrayEquals(payloadData, verifyPayloadData);
259 TermPositions tp = reader.termPositions(terms[0]);
262 // now we don't read this payload
264 assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
265 byte[] payload = tp.getPayload(null, 0);
266 assertEquals(payload[0], payloadData[numTerms]);
269 // we don't read this payload and skip to a different document
272 assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
273 payload = tp.getPayload(null, 0);
274 assertEquals(payload[0], payloadData[5 * numTerms]);
278 * Test different lengths at skip points
283 assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
284 tp.skipTo(skipInterval - 1);
286 assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
287 tp.skipTo(2 * skipInterval - 1);
289 assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
290 tp.skipTo(3 * skipInterval - 1);
292 assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength());
295 * Test multiple call of getPayload()
297 tp.getPayload(null, 0);
299 // it is forbidden to call getPayload() more than once
300 // without calling nextPosition()
301 tp.getPayload(null, 0);
302 fail("Expected exception not thrown");
303 } catch (Exception expected) {
304 // expected exception
310 analyzer = new PayloadAnalyzer();
311 writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT,
312 analyzer).setOpenMode(OpenMode.CREATE));
313 String singleTerm = "lucene";
316 d.add(newField(fieldName, singleTerm, Field.Store.NO, Field.Index.ANALYZED));
317 // add a payload whose length is greater than the buffer size of BufferedIndexOutput
318 payloadData = generateRandomData(2000);
319 analyzer.setPayloadData(fieldName, payloadData, 100, 1500);
320 writer.addDocument(d);
327 reader = IndexReader.open(dir, true);
328 tp = reader.termPositions(new Term(fieldName, singleTerm));
332 verifyPayloadData = new byte[tp.getPayloadLength()];
333 tp.getPayload(verifyPayloadData, 0);
334 byte[] portion = new byte[1500];
335 System.arraycopy(payloadData, 100, portion, 0, 1500);
337 assertByteArrayEquals(portion, verifyPayloadData);
342 private void generateRandomData(byte[] data) {
343 // this test needs the random data to be valid unicode
344 String s = _TestUtil.randomFixedByteLengthUnicodeString(random, data.length);
347 b = s.getBytes("UTF-8");
348 } catch (UnsupportedEncodingException e) {
349 throw new RuntimeException(e);
351 assert b.length == data.length;
352 System.arraycopy(b, 0, data, 0, b.length);
355 private byte[] generateRandomData(int n) {
356 byte[] data = new byte[n];
357 generateRandomData(data);
361 private Term[] generateTerms(String fieldName, int n) {
362 int maxDigits = (int) (Math.log(n) / Math.log(10));
363 Term[] terms = new Term[n];
364 StringBuilder sb = new StringBuilder();
365 for (int i = 0; i < n; i++) {
368 int zeros = maxDigits - (int) (Math.log(i) / Math.log(10));
369 for (int j = 0; j < zeros; j++) {
373 terms[i] = new Term(fieldName, sb.toString());
379 void assertByteArrayEquals(byte[] b1, byte[] b2) {
380 if (b1.length != b2.length) {
381 fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length);
384 for (int i = 0; i < b1.length; i++) {
385 if (b1[i] != b2[i]) {
386 fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]);
393 * This Analyzer uses an WhitespaceTokenizer and PayloadFilter.
395 private static class PayloadAnalyzer extends Analyzer {
396 Map<String,PayloadData> fieldToData = new HashMap<String,PayloadData>();
398 void setPayloadData(String field, byte[] data, int offset, int length) {
399 fieldToData.put(field, new PayloadData(0, data, offset, length));
402 void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) {
403 fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length));
407 public TokenStream tokenStream(String fieldName, Reader reader) {
408 PayloadData payload = fieldToData.get(fieldName);
409 TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
410 if (payload != null) {
411 if (payload.numFieldInstancesToSkip == 0) {
412 ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length);
414 payload.numFieldInstancesToSkip--;
420 private static class PayloadData {
424 int numFieldInstancesToSkip;
426 PayloadData(int skip, byte[] data, int offset, int length) {
427 numFieldInstancesToSkip = skip;
429 this.offset = offset;
430 this.length = length;
437 * This Filter adds payloads to the tokens.
439 private static class PayloadFilter extends TokenFilter {
443 private int startOffset;
444 PayloadAttribute payloadAtt;
446 public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
449 this.length = length;
450 this.offset = offset;
451 this.startOffset = offset;
452 payloadAtt = addAttribute(PayloadAttribute.class);
456 public boolean incrementToken() throws IOException {
457 boolean hasNext = input.incrementToken();
459 if (offset + length <= data.length) {
460 Payload p = new Payload();
461 payloadAtt.setPayload(p);
462 p.setData(data, offset, length);
465 payloadAtt.setPayload(null);
473 public void reset() throws IOException {
475 this.offset = startOffset;
479 public void testThreadSafety() throws Exception {
480 final int numThreads = 5;
481 final int numDocs = atLeast(50);
482 final ByteArrayPool pool = new ByteArrayPool(numThreads, 5);
484 Directory dir = newDirectory();
485 final IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
486 TEST_VERSION_CURRENT, new MockAnalyzer(random)));
487 final String field = "test";
489 Thread[] ingesters = new Thread[numThreads];
490 for (int i = 0; i < numThreads; i++) {
491 ingesters[i] = new Thread() {
495 for (int j = 0; j < numDocs; j++) {
496 Document d = new Document();
497 d.add(new Field(field, new PoolingPayloadTokenStream(pool)));
498 writer.addDocument(d);
500 } catch (Exception e) {
506 ingesters[i].start();
509 for (int i = 0; i < numThreads; i++) {
513 IndexReader reader = IndexReader.open(dir, true);
514 TermEnum terms = reader.terms();
515 while (terms.next()) {
516 TermPositions tp = reader.termPositions(terms.term());
518 int freq = tp.freq();
519 for (int i = 0; i < freq; i++) {
521 byte payload[] = new byte[5];
522 tp.getPayload(payload, 0);
523 assertEquals(terms.term().text, new String(payload, 0, payload.length, "UTF-8"));
531 assertEquals(pool.size(), numThreads);
534 private class PoolingPayloadTokenStream extends TokenStream {
535 private byte[] payload;
536 private boolean first;
537 private ByteArrayPool pool;
540 CharTermAttribute termAtt;
541 PayloadAttribute payloadAtt;
543 PoolingPayloadTokenStream(ByteArrayPool pool) {
545 payload = pool.get();
546 generateRandomData(payload);
548 term = new String(payload, 0, payload.length, "UTF-8");
549 } catch (UnsupportedEncodingException e) {
550 throw new RuntimeException(e);
553 payloadAtt = addAttribute(PayloadAttribute.class);
554 termAtt = addAttribute(CharTermAttribute.class);
558 public boolean incrementToken() throws IOException {
559 if (!first) return false;
562 termAtt.append(term);
563 payloadAtt.setPayload(new Payload(payload));
568 public void close() throws IOException {
569 pool.release(payload);
574 private static class ByteArrayPool {
575 private List<byte[]> pool;
577 ByteArrayPool(int capacity, int size) {
578 pool = new ArrayList<byte[]>();
579 for (int i = 0; i < capacity; i++) {
580 pool.add(new byte[size]);
584 synchronized byte[] get() {
585 return pool.remove(0);
588 synchronized void release(byte[] b) {
592 synchronized int size() {
597 public void testAcrossFields() throws Exception {
598 Directory dir = newDirectory();
599 RandomIndexWriter writer = new RandomIndexWriter(random, dir,
600 new MockAnalyzer(random, MockTokenizer.WHITESPACE, true));
601 Document doc = new Document();
602 doc.add(new Field("hasMaybepayload", "here we go", Field.Store.YES, Field.Index.ANALYZED));
603 writer.addDocument(doc);
606 writer = new RandomIndexWriter(random, dir,
607 new MockAnalyzer(random, MockTokenizer.WHITESPACE, true));
608 doc = new Document();
609 doc.add(new Field("hasMaybepayload2", "here we go", Field.Store.YES, Field.Index.ANALYZED));
610 writer.addDocument(doc);
611 writer.addDocument(doc);