package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;

public class TestPayloads extends LuceneTestCase {

    // Simple tests to test the Payload class
    public void testPayload() throws Exception {
        byte[] testData = "This is a test!".getBytes();
        Payload payload = new Payload(testData);
        assertEquals("Wrong payload length.", testData.length, payload.length());

        // test copyTo(): a target array that is too small must trigger an exception
        byte[] target = new byte[testData.length - 1];
        try {
            payload.copyTo(target, 0);
            fail("Expected exception not thrown");
        } catch (Exception expected) {
            // expected exception
        }

        target = new byte[testData.length + 3];
        payload.copyTo(target, 3);

        for (int i = 0; i < testData.length; i++) {
            assertEquals(testData[i], target[i + 3]);
        }

        // test toByteArray()
        target = payload.toByteArray();
        assertByteArrayEquals(testData, target);

        // test byteAt()
        for (int i = 0; i < testData.length; i++) {
            assertEquals(payload.byteAt(i), testData[i]);
        }

        try {
            payload.byteAt(testData.length + 1);
            fail("Expected exception not thrown");
        } catch (Exception expected) {
            // expected exception
        }

        Payload clone = (Payload) payload.clone();
        assertEquals(payload.length(), clone.length());
        for (int i = 0; i < payload.length(); i++) {
            assertEquals(payload.byteAt(i), clone.byteAt(i));
        }
    }

    // Tests whether the DocumentWriter and SegmentMerger correctly enable the
    // payload bit in the FieldInfo
    public void testPayloadFieldBit() throws Exception {
        Directory ram = newDirectory();
        PayloadAnalyzer analyzer = new PayloadAnalyzer();
        IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
        Document d = new Document();
        // this field won't have any payloads
        d.add(newField("f1", "This field has no payloads", Field.Store.NO, Field.Index.ANALYZED));
        // this field will have payloads in all docs, however not for all term positions,
        // so this field is used to check if the DocumentWriter correctly enables the payloads bit
        // even if only some term positions have payloads
        d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
        d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
        // this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads
        // enabled in only some documents
        d.add(newField("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED));
        // only add payload data for field f2
        analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1);
        writer.addDocument(d);
        // flush
        writer.close();

        SegmentReader reader = SegmentReader.getOnlySegmentReader(ram);
        FieldInfos fi = reader.fieldInfos();
        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
        assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads);
        reader.close();

        // now we add another document which has payloads for field f3 and verify if the SegmentMerger
        // enabled payloads for that field
        writer = new IndexWriter(ram, newIndexWriterConfig(TEST_VERSION_CURRENT,
            analyzer).setOpenMode(OpenMode.CREATE));
        d = new Document();
        d.add(newField("f1", "This field has no payloads", Field.Store.NO, Field.Index.ANALYZED));
        d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
        d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
        d.add(newField("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED));
        // add payload data for fields f2 and f3
        analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1);
        analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3);
        writer.addDocument(d);
        // force merge
        writer.optimize();
        // flush
        writer.close();

        reader = SegmentReader.getOnlySegmentReader(ram);
        fi = reader.fieldInfos();
        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
        assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
        assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads);
        reader.close();
        ram.close();
    }

    // Tests if payloads are correctly stored and loaded using both RAMDirectory and FSDirectory
    public void testPayloadsEncoding() throws Exception {
        // first perform the test using a RAMDirectory
        Directory dir = newDirectory();
        performTest(dir);
        dir.close();

        // now use a FSDirectory and repeat the same test
        File dirName = _TestUtil.getTempDir("test_payloads");
        dir = newFSDirectory(dirName);
        performTest(dir);
        dir.close();
        _TestUtil.rmDir(dirName);
    }

    // builds an index with payloads in the given Directory and performs
    // different tests to verify the payload encoding
    private void performTest(Directory dir) throws Exception {
        PayloadAnalyzer analyzer = new PayloadAnalyzer();
        IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
            TEST_VERSION_CURRENT, analyzer)
            .setOpenMode(OpenMode.CREATE)
            .setMergePolicy(newLogMergePolicy()));

        // should be in sync with value in TermInfosWriter
        final int skipInterval = 16;

        final int numTerms = 5;
        final String fieldName = "f1";

        int numDocs = skipInterval + 1;
        // create content for the test documents with just a few terms
        Term[] terms = generateTerms(fieldName, numTerms);
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < terms.length; i++) {
            sb.append(terms[i].text);
            sb.append(" ");
        }
        String content = sb.toString();

        int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
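        // a note on the arithmetic above: the first loop below adds 2 * numDocs documents
        // whose terms all carry 1-byte payloads (numTerms * numDocs * 2 bytes), and the
        // second loop adds numDocs documents where document i carries i-byte payloads per
        // term (numTerms * (0 + 1 + ... + (numDocs - 1)) = numTerms * numDocs * (numDocs - 1) / 2 bytes)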
        byte[] payloadData = generateRandomData(payloadDataLength);

        Document d = new Document();
        d.add(newField(fieldName, content, Field.Store.NO, Field.Index.ANALYZED));
        // add the same document multiple times to have the same payload lengths for all
        // occurrences within two consecutive skip intervals
        int offset = 0;
        for (int i = 0; i < 2 * numDocs; i++) {
            analyzer.setPayloadData(fieldName, payloadData, offset, 1);
            offset += numTerms;
            writer.addDocument(d);
        }

        // make sure we create more than one segment to test merging
        writer.commit();

        // now we make sure to have different payload lengths at the next skip point
        for (int i = 0; i < numDocs; i++) {
            analyzer.setPayloadData(fieldName, payloadData, offset, i);
            offset += i * numTerms;
            writer.addDocument(d);
        }

        writer.optimize();
        // flush
        writer.close();
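
        // at this point the index holds 3 * numDocs documents: docs 0 .. 2 * numDocs - 1
        // carry 1-byte payloads per term, and doc 2 * numDocs + i carries i-byte payloads
        // per term; the verification below relies on this layout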

        /*
         * Verify the index
         * first we test if all payloads are stored correctly
         */
        IndexReader reader = IndexReader.open(dir, true);

        byte[] verifyPayloadData = new byte[payloadDataLength];
        offset = 0;
        TermPositions[] tps = new TermPositions[numTerms];
        for (int i = 0; i < numTerms; i++) {
            tps[i] = reader.termPositions(terms[i]);
        }

        while (tps[0].next()) {
            for (int i = 1; i < numTerms; i++) {
                tps[i].next();
            }
            int freq = tps[0].freq();
            for (int i = 0; i < freq; i++) {
                for (int j = 0; j < numTerms; j++) {
                    tps[j].nextPosition();
                    if (tps[j].isPayloadAvailable()) {
                        tps[j].getPayload(verifyPayloadData, offset);
                        offset += tps[j].getPayloadLength();
                    }
                }
            }
        }

        for (int i = 0; i < numTerms; i++) {
            tps[i].close();
        }

        assertByteArrayEquals(payloadData, verifyPayloadData);

        /*
         * test lazy skipping
         */
        TermPositions tp = reader.termPositions(terms[0]);
        tp.next();
        tp.nextPosition();
        // now we don't read this payload
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
        byte[] payload = tp.getPayload(null, 0);
        assertEquals(payload[0], payloadData[numTerms]);
        tp.nextPosition();

        // we don't read this payload and skip to a different document
        tp.skipTo(5);
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
        payload = tp.getPayload(null, 0);
        assertEquals(payload[0], payloadData[5 * numTerms]);
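        // (indexing note: each of these documents carries numTerms payload bytes, one per
        // term, and term order matches token order, so the payload of terms[0] in document d
        // is payloadData[d * numTerms]; hence the numTerms and 5 * numTerms offsets above)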

        /*
         * Test different lengths at skip points
         */
        tp.seek(terms[1]);
        tp.next();
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
        tp.skipTo(skipInterval - 1);
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
        tp.skipTo(2 * skipInterval - 1);
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
        tp.skipTo(3 * skipInterval - 1);
        tp.nextPosition();
        assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength());

        /*
         * Test multiple call of getPayload()
         */
        tp.getPayload(null, 0);
        try {
            // it is forbidden to call getPayload() more than once
            // without calling nextPosition()
            tp.getPayload(null, 0);
            fail("Expected exception not thrown");
        } catch (Exception expected) {
            // expected exception
        }

        reader.close();

        // test long payload
        analyzer = new PayloadAnalyzer();
        writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT,
            analyzer).setOpenMode(OpenMode.CREATE));
        String singleTerm = "lucene";

        d = new Document();
        d.add(newField(fieldName, singleTerm, Field.Store.NO, Field.Index.ANALYZED));
        // add a payload whose length is greater than the buffer size of BufferedIndexOutput
        payloadData = generateRandomData(2000);
        analyzer.setPayloadData(fieldName, payloadData, 100, 1500);
        writer.addDocument(d);

        // force merge
        writer.optimize();
        // flush
        writer.close();

        reader = IndexReader.open(dir, true);
        tp = reader.termPositions(new Term(fieldName, singleTerm));
        tp.next();
        tp.nextPosition();

        verifyPayloadData = new byte[tp.getPayloadLength()];
        tp.getPayload(verifyPayloadData, 0);
        byte[] portion = new byte[1500];
        System.arraycopy(payloadData, 100, portion, 0, 1500);

        assertByteArrayEquals(portion, verifyPayloadData);
        reader.close();
    }

    private void generateRandomData(byte[] data) {
        random.nextBytes(data);
    }

    private byte[] generateRandomData(int n) {
        byte[] data = new byte[n];
        generateRandomData(data);
        return data;
    }

    private Term[] generateTerms(String fieldName, int n) {
        int maxDigits = (int) (Math.log(n) / Math.log(10));
        Term[] terms = new Term[n];
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < n; i++) {
            sb.setLength(0);
            sb.append("t");
            // pad with leading zeros so all terms have the same length;
            // guard i == 0, for which Math.log() returns -Infinity
            int zeros = maxDigits - (i == 0 ? 0 : (int) (Math.log(i) / Math.log(10)));
            for (int j = 0; j < zeros; j++) {
                sb.append("0");
            }
            sb.append(i);
            terms[i] = new Term(fieldName, sb.toString());
        }
        return terms;
    }

    void assertByteArrayEquals(byte[] b1, byte[] b2) {
        if (b1.length != b2.length) {
            fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length);
        }

        for (int i = 0; i < b1.length; i++) {
            if (b1[i] != b2[i]) {
                fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]);
            }
        }
    }

    /**
     * This Analyzer uses a WhitespaceTokenizer and PayloadFilter.
     */
    private static class PayloadAnalyzer extends Analyzer {
        Map<String,PayloadData> fieldToData = new HashMap<String,PayloadData>();

        void setPayloadData(String field, byte[] data, int offset, int length) {
            fieldToData.put(field, new PayloadData(0, data, offset, length));
        }

        void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) {
            fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length));
        }

        @Override
        public TokenStream tokenStream(String fieldName, Reader reader) {
            PayloadData payload = fieldToData.get(fieldName);
            TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
            if (payload != null) {
                if (payload.numFieldInstancesToSkip == 0) {
                    ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length);
                } else {
                    payload.numFieldInstancesToSkip--;
                }
            }
            return ts;
        }
    }

    private static class PayloadData {
        byte[] data;
        int offset;
        int length;
        int numFieldInstancesToSkip;

        PayloadData(int skip, byte[] data, int offset, int length) {
            numFieldInstancesToSkip = skip;
            this.data = data;
            this.offset = offset;
            this.length = length;
        }
    }

    /**
     * This Filter adds payloads to the tokens.
     */
    private static class PayloadFilter extends TokenFilter {
        private byte[] data;
        private int length;
        private int offset;
        Payload payload = new Payload();
        PayloadAttribute payloadAtt;

        public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
            super(in);
            this.data = data;
            this.length = length;
            this.offset = offset;
            payloadAtt = addAttribute(PayloadAttribute.class);
        }

        @Override
        public boolean incrementToken() throws IOException {
            boolean hasNext = input.incrementToken();
            if (hasNext) {
                if (offset + length <= data.length) {
                    Payload p = new Payload();
                    payloadAtt.setPayload(p);
                    p.setData(data, offset, length);
                    offset += length;
                } else {
                    payloadAtt.setPayload(null);
                }
            }
            return hasNext;
        }
    }

    public void testThreadSafety() throws Exception {
        final int numThreads = 5;
        final int numDocs = atLeast(50);
        final ByteArrayPool pool = new ByteArrayPool(numThreads, 5);

        Directory dir = newDirectory();
        final IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
            TEST_VERSION_CURRENT, new MockAnalyzer(random)));
        final String field = "test";

        Thread[] ingesters = new Thread[numThreads];
        for (int i = 0; i < numThreads; i++) {
            ingesters[i] = new Thread() {
                @Override
                public void run() {
                    try {
                        for (int j = 0; j < numDocs; j++) {
                            Document d = new Document();
                            d.add(new Field(field, new PoolingPayloadTokenStream(pool)));
                            writer.addDocument(d);
                        }
                    } catch (Exception e) {
                        e.printStackTrace();
                        fail(e.toString());
                    }
                }
            };
            ingesters[i].start();
        }

        for (int i = 0; i < numThreads; i++) {
            ingesters[i].join();
        }
        writer.close();

        IndexReader reader = IndexReader.open(dir, true);
        TermEnum terms = reader.terms();
        while (terms.next()) {
            TermPositions tp = reader.termPositions(terms.term());
            while (tp.next()) {
                int freq = tp.freq();
                for (int i = 0; i < freq; i++) {
                    tp.nextPosition();
                    // the term text encodes the payload bytes, so the stored payload
                    // must round-trip back to the term
                    assertEquals(pool.bytesToString(tp.getPayload(new byte[5], 0)), terms.term().text);
                }
            }
            tp.close();
        }
        terms.close();
        reader.close();
        dir.close();

        assertEquals(pool.size(), numThreads);
    }
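
    // A TokenStream that emits a single token per document, drawing its payload bytes from
    // a shared pool; the term text is derived from those same bytes, which is what lets
    // testThreadSafety verify above that every payload round-tripped intact.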
    private class PoolingPayloadTokenStream extends TokenStream {
        private byte[] payload;
        private boolean first;
        private ByteArrayPool pool;
        private String term;

        CharTermAttribute termAtt;
        PayloadAttribute payloadAtt;

        PoolingPayloadTokenStream(ByteArrayPool pool) {
            this.pool = pool;
            payload = pool.get();
            generateRandomData(payload);
            term = pool.bytesToString(payload);
            first = true;
            payloadAtt = addAttribute(PayloadAttribute.class);
            termAtt = addAttribute(CharTermAttribute.class);
        }

        @Override
        public boolean incrementToken() throws IOException {
            if (!first) return false;
            first = false;
            clearAttributes();
            termAtt.append(term);
            payloadAtt.setPayload(new Payload(payload));
            return true;
        }

        @Override
        public void close() throws IOException {
            pool.release(payload);
        }
    }
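
    // A fixed-size pool of byte arrays shared by the ingesting threads; if every stream
    // releases its array in close(), the pool ends up with exactly numThreads entries
    // again, which is what the final assert in testThreadSafety checks.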
    private static class ByteArrayPool {
        private List<byte[]> pool;

        ByteArrayPool(int capacity, int size) {
            pool = new ArrayList<byte[]>();
            for (int i = 0; i < capacity; i++) {
                pool.add(new byte[size]);
            }
        }

        private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result();

        synchronized String bytesToString(byte[] bytes) {
            String s = new String(bytes);
            UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result);
            try {
                return new String(utf8Result.result, 0, utf8Result.length, "UTF-8");
            } catch (UnsupportedEncodingException uee) {
                return null;
            }
        }

        synchronized byte[] get() {
            return pool.remove(0);
        }

        synchronized void release(byte[] b) {
            pool.add(b);
        }

        synchronized int size() {
            return pool.size();
        }
    }

    public void testAcrossFields() throws Exception {
        Directory dir = newDirectory();
        RandomIndexWriter writer = new RandomIndexWriter(random, dir,
            new MockAnalyzer(random, MockTokenizer.WHITESPACE, true));
        Document doc = new Document();
        doc.add(new Field("hasMaybepayload", "here we go", Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
        writer.close();

        writer = new RandomIndexWriter(random, dir,
            new MockAnalyzer(random, MockTokenizer.WHITESPACE, true));
        doc = new Document();
        doc.add(new Field("hasMaybepayload2", "here we go", Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
        writer.addDocument(doc);
        writer.close();

        dir.close();
    }
}