--- /dev/null
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+
+public class TestPayloads extends LuceneTestCase {
+
+ // Simple sanity tests of the Payload class
+ public void testPayload() throws Exception {
+ byte[] testData = "This is a test!".getBytes();
+ Payload payload = new Payload(testData);
+ assertEquals("Wrong payload length.", testData.length, payload.length());
+
+ // test copyTo()
+ byte[] target = new byte[testData.length - 1];
+ try {
+ payload.copyTo(target, 0);
+ fail("Expected exception not thrown");
+ } catch (Exception expected) {
+ // expected exception
+ }
+
+ target = new byte[testData.length + 3];
+ payload.copyTo(target, 3);
+
+ for (int i = 0; i < testData.length; i++) {
+ assertEquals(testData[i], target[i + 3]);
+ }
+
+
+ // test toByteArray()
+ target = payload.toByteArray();
+ assertByteArrayEquals(testData, target);
+
+ // test byteAt()
+ for (int i = 0; i < testData.length; i++) {
+ assertEquals(testData[i], payload.byteAt(i));
+ }
+
+ try {
+ payload.byteAt(testData.length + 1);
+ fail("Expected exception not thrown");
+ } catch (Exception expected) {
+ // expected exception
+ }
+
+ Payload clone = (Payload) payload.clone();
+ assertEquals(payload.length(), clone.length());
+ for (int i = 0; i < payload.length(); i++) {
+ assertEquals(payload.byteAt(i), clone.byteAt(i));
+ }
+
+ }
+
+ // Tests whether the DocumentWriter and SegmentMerger correctly enable the
+ // payload bit in the FieldInfo
+ public void testPayloadFieldBit() throws Exception {
+ Directory ram = newDirectory();
+ PayloadAnalyzer analyzer = new PayloadAnalyzer();
+ IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
+ Document d = new Document();
+ // this field won't have any payloads
+ d.add(newField("f1", "This field has no payloads", Field.Store.NO, Field.Index.ANALYZED));
+ // this field will have payloads in all docs, however not for all term positions,
+ // so this field is used to check if the DocumentWriter correctly enables the payloads bit
+ // even if only some term positions have payloads
+ d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
+ d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
+ // this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads
+ // enabled in only some documents
+ d.add(newField("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED));
+ // only add payload data for field f2
+ analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1);
+ writer.addDocument(d);
+ // flush
+ writer.close();
+
+ SegmentReader reader = SegmentReader.getOnlySegmentReader(ram);
+ FieldInfos fi = reader.fieldInfos();
+ assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
+ assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
+ assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads);
+ reader.close();
+
+ // now we add another document which has payloads for field f3 and verify if the SegmentMerger
+ // enabled payloads for that field
+ writer = new IndexWriter(ram, newIndexWriterConfig(TEST_VERSION_CURRENT,
+ analyzer).setOpenMode(OpenMode.CREATE));
+ d = new Document();
+ d.add(newField("f1", "This field has no payloads", Field.Store.NO, Field.Index.ANALYZED));
+ d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
+ d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
+ d.add(newField("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED));
+ // add payload data for field f2 and f3
+ analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1);
+ analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3);
+ writer.addDocument(d);
+ // force merge
+ writer.optimize();
+ // flush
+ writer.close();
+
+ reader = SegmentReader.getOnlySegmentReader(ram);
+ fi = reader.fieldInfos();
+ assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
+ assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
+ assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads);
+ reader.close();
+ ram.close();
+ }
+
+ // Tests whether payloads are correctly stored and loaded using both RAMDirectory and FSDirectory
+ public void testPayloadsEncoding() throws Exception {
+ // first perform the test using a RAMDirectory
+ Directory dir = newDirectory();
+ performTest(dir);
+ dir.close();
+ // now use an FSDirectory and repeat the same test
+ File dirName = _TestUtil.getTempDir("test_payloads");
+ dir = newFSDirectory(dirName);
+ performTest(dir);
+ _TestUtil.rmDir(dirName);
+ dir.close();
+ }
+
+ // builds an index with payloads in the given Directory and performs
+ // different tests to verify the payload encoding
+ private void performTest(Directory dir) throws Exception {
+ PayloadAnalyzer analyzer = new PayloadAnalyzer();
+ IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
+ TEST_VERSION_CURRENT, analyzer)
+ .setOpenMode(OpenMode.CREATE)
+ .setMergePolicy(newLogMergePolicy()));
+
+ // should be in sync with value in TermInfosWriter
+ final int skipInterval = 16;
+
+ final int numTerms = 5;
+ final String fieldName = "f1";
+
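+ // index one more document than the skip interval so that skip entries are
+ // actually written and the skipTo() calls below exercise them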
+ int numDocs = skipInterval + 1;
+ // create content for the test documents with just a few terms
+ Term[] terms = generateTerms(fieldName, numTerms);
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < terms.length; i++) {
+ sb.append(terms[i].text);
+ sb.append(" ");
+ }
+ String content = sb.toString();
+
+
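+ // total payload bytes needed: the first 2 * numDocs docs get a 1-byte payload
+ // per term (numTerms * numDocs * 2), the next numDocs docs get an i-byte
+ // payload per term for doc i (numTerms * (0 + 1 + ... + (numDocs - 1)))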
+ int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
+ byte[] payloadData = generateRandomData(payloadDataLength);
+
+ Document d = new Document();
+ d.add(newField(fieldName, content, Field.Store.NO, Field.Index.ANALYZED));
+ // add the same document multiple times to have the same payload lengths for all
+ // occurrences within two consecutive skip intervals
+ int offset = 0;
+ for (int i = 0; i < 2 * numDocs; i++) {
+ analyzer.setPayloadData(fieldName, payloadData, offset, 1);
+ offset += numTerms;
+ writer.addDocument(d);
+ }
+
+ // make sure we create more than one segment to test merging
+ writer.commit();
+
+ // now we make sure to have different payload lengths at the next skip point
+ for (int i = 0; i < numDocs; i++) {
+ analyzer.setPayloadData(fieldName, payloadData, offset, i);
+ offset += i * numTerms;
+ writer.addDocument(d);
+ }
+
+ writer.optimize();
+ // flush
+ writer.close();
+
+
+ /*
+ * Verify the index:
+ * first we check that all payloads were stored correctly
+ */
+ IndexReader reader = IndexReader.open(dir, true);
+
+ byte[] verifyPayloadData = new byte[payloadDataLength];
+ offset = 0;
+ TermPositions[] tps = new TermPositions[numTerms];
+ for (int i = 0; i < numTerms; i++) {
+ tps[i] = reader.termPositions(terms[i]);
+ }
+
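+ // every term occurs exactly once in every document, so all enumerators
+ // advance in lockstep and the payloads are read back in the order they were written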
+ while (tps[0].next()) {
+ for (int i = 1; i < numTerms; i++) {
+ tps[i].next();
+ }
+ int freq = tps[0].freq();
+
+ for (int i = 0; i < freq; i++) {
+ for (int j = 0; j < numTerms; j++) {
+ tps[j].nextPosition();
+ if (tps[j].isPayloadAvailable()) {
+ tps[j].getPayload(verifyPayloadData, offset);
+ offset += tps[j].getPayloadLength();
+ }
+ }
+ }
+ }
+
+ for (int i = 0; i < numTerms; i++) {
+ tps[i].close();
+ }
+
+ assertByteArrayEquals(payloadData, verifyPayloadData);
+
+ /*
+ * test lazy skipping
+ */
+ TermPositions tp = reader.termPositions(terms[0]);
+ tp.next();
+ tp.nextPosition();
+ // each term occurs only once per document, so we must advance to the
+ // next document before reading more positions
+ tp.next();
+ // now we don't read this payload
+ tp.nextPosition();
+ assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+ byte[] payload = tp.getPayload(null, 0);
+ assertEquals(payloadData[numTerms], payload[0]);
+ tp.nextPosition();
+
+ // we don't read this payload and skip to a different document
+ tp.skipTo(5);
+ tp.nextPosition();
+ assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+ payload = tp.getPayload(null, 0);
+ assertEquals(payloadData[5 * numTerms], payload[0]);
+
+
+ /*
+ * Test different lengths at skip points
+ */
+ tp.seek(terms[1]);
+ tp.next();
+ tp.nextPosition();
+ assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+ tp.skipTo(skipInterval - 1);
+ tp.nextPosition();
+ assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+ tp.skipTo(2 * skipInterval - 1);
+ tp.nextPosition();
+ assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+ tp.skipTo(3 * skipInterval - 1);
+ tp.nextPosition();
+ assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength());
+
+ /*
+ * Test multiple calls of getPayload()
+ */
+ tp.getPayload(null, 0);
+ try {
+ // it is forbidden to call getPayload() more than once
+ // without calling nextPosition()
+ tp.getPayload(null, 0);
+ fail("Expected exception not thrown");
+ } catch (Exception expected) {
+ // expected exception
+ }
+
+ reader.close();
+
+ // test long payload
+ analyzer = new PayloadAnalyzer();
+ writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT,
+ analyzer).setOpenMode(OpenMode.CREATE));
+ String singleTerm = "lucene";
+
+ d = new Document();
+ d.add(newField(fieldName, singleTerm, Field.Store.NO, Field.Index.ANALYZED));
+ // add a payload whose length is greater than the buffer size of BufferedIndexOutput
+ payloadData = generateRandomData(2000);
+ analyzer.setPayloadData(fieldName, payloadData, 100, 1500);
+ writer.addDocument(d);
+
+
+ writer.optimize();
+ // flush
+ writer.close();
+
+ reader = IndexReader.open(dir, true);
+ tp = reader.termPositions(new Term(fieldName, singleTerm));
+ tp.next();
+ tp.nextPosition();
+
+ verifyPayloadData = new byte[tp.getPayloadLength()];
+ tp.getPayload(verifyPayloadData, 0);
+ byte[] portion = new byte[1500];
+ System.arraycopy(payloadData, 100, portion, 0, 1500);
+
+ assertByteArrayEquals(portion, verifyPayloadData);
+ reader.close();
+
+ }
+
+ private void generateRandomData(byte[] data) {
+ // this test needs the random data to be valid unicode
+ String s = _TestUtil.randomFixedByteLengthUnicodeString(random, data.length);
+ byte b[];
+ try {
+ b = s.getBytes("UTF-8");
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ assert b.length == data.length;
+ System.arraycopy(b, 0, data, 0, b.length);
+ }
+
+ private byte[] generateRandomData(int n) {
+ byte[] data = new byte[n];
+ generateRandomData(data);
+ return data;
+ }
+
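+ // generates n terms "t0", "t1", ..., zero-padded to equal width so that the
+ // lexicographic term order matches the numeric order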
+ private Term[] generateTerms(String fieldName, int n) {
+ int maxDigits = (int) (Math.log(n) / Math.log(10));
+ Term[] terms = new Term[n];
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < n; i++) {
+ sb.setLength(0);
+ sb.append("t");
+ // Math.log(0) is -Infinity, so treat i == 0 as a one-digit number
+ int zeros = maxDigits - (i == 0 ? 0 : (int) (Math.log(i) / Math.log(10)));
+ for (int j = 0; j < zeros; j++) {
+ sb.append("0");
+ }
+ sb.append(i);
+ terms[i] = new Term(fieldName, sb.toString());
+ }
+ return terms;
+ }
+
+
+ void assertByteArrayEquals(byte[] b1, byte[] b2) {
+ if (b1.length != b2.length) {
+ fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length);
+ }
+
+ for (int i = 0; i < b1.length; i++) {
+ if (b1[i] != b2[i]) {
+ fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]);
+ }
+ }
+ }
+
+
+ /**
+ * This Analyzer uses a WhitespaceTokenizer and PayloadFilter.
+ */
+ private static class PayloadAnalyzer extends Analyzer {
+ Map<String,PayloadData> fieldToData = new HashMap<String,PayloadData>();
+
+ void setPayloadData(String field, byte[] data, int offset, int length) {
+ fieldToData.put(field, new PayloadData(0, data, offset, length));
+ }
+
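+ // like above, but payloads are only produced once the first
+ // numFieldInstancesToSkip instances of the field have been tokenized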
+ void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) {
+ fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length));
+ }
+
+ @Override
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ PayloadData payload = fieldToData.get(fieldName);
+ TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+ if (payload != null) {
+ if (payload.numFieldInstancesToSkip == 0) {
+ ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length);
+ } else {
+ payload.numFieldInstancesToSkip--;
+ }
+ }
+ return ts;
+ }
+
+ private static class PayloadData {
+ byte[] data;
+ int offset;
+ int length;
+ int numFieldInstancesToSkip;
+
+ PayloadData(int skip, byte[] data, int offset, int length) {
+ numFieldInstancesToSkip = skip;
+ this.data = data;
+ this.offset = offset;
+ this.length = length;
+ }
+ }
+ }
+
+
+ /**
+ * This Filter adds payloads to the tokens.
+ */
+ private static class PayloadFilter extends TokenFilter {
+ private byte[] data;
+ private int length;
+ private int offset;
+ private int startOffset;
+ PayloadAttribute payloadAtt;
+
+ public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
+ super(in);
+ this.data = data;
+ this.length = length;
+ this.offset = offset;
+ this.startOffset = offset;
+ payloadAtt = addAttribute(PayloadAttribute.class);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ boolean hasNext = input.incrementToken();
+ if (hasNext) {
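+ // hand out the next 'length' bytes of the shared data array as this
+ // token's payload; once the data is exhausted, emit no payload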
+ if (offset + length <= data.length) {
+ Payload p = new Payload();
+ payloadAtt.setPayload(p);
+ p.setData(data, offset, length);
+ offset += length;
+ } else {
+ payloadAtt.setPayload(null);
+ }
+ }
+
+ return hasNext;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ this.offset = startOffset;
+ }
+ }
+
+ public void testThreadSafety() throws Exception {
+ final int numThreads = 5;
+ final int numDocs = atLeast(50);
+ final ByteArrayPool pool = new ByteArrayPool(numThreads, 5);
+
+ Directory dir = newDirectory();
+ final IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
+ TEST_VERSION_CURRENT, new MockAnalyzer(random)));
+ final String field = "test";
+
+ Thread[] ingesters = new Thread[numThreads];
+ for (int i = 0; i < numThreads; i++) {
+ ingesters[i] = new Thread() {
+ @Override
+ public void run() {
+ try {
+ for (int j = 0; j < numDocs; j++) {
+ Document d = new Document();
+ d.add(new Field(field, new PoolingPayloadTokenStream(pool)));
+ writer.addDocument(d);
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.toString());
+ }
+ }
+ };
+ ingesters[i].start();
+ }
+
+ for (int i = 0; i < numThreads; i++) {
+ ingesters[i].join();
+ }
+ writer.close();
+ IndexReader reader = IndexReader.open(dir, true);
+ TermEnum terms = reader.terms();
+ while (terms.next()) {
+ TermPositions tp = reader.termPositions(terms.term());
+ while(tp.next()) {
+ int freq = tp.freq();
+ for (int i = 0; i < freq; i++) {
+ tp.nextPosition();
+ byte payload[] = new byte[5];
+ tp.getPayload(payload, 0);
+ assertEquals(terms.term().text, new String(payload, 0, payload.length, "UTF-8"));
+ }
+ }
+ tp.close();
+ }
+ terms.close();
+ reader.close();
+ dir.close();
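+ // each PoolingPayloadTokenStream returns its buffer to the pool in close(),
+ // so a full pool verifies that every stream was closed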
+ assertEquals(numThreads, pool.size());
+ }
+
+ private class PoolingPayloadTokenStream extends TokenStream {
+ private byte[] payload;
+ private boolean first;
+ private ByteArrayPool pool;
+ private String term;
+
+ CharTermAttribute termAtt;
+ PayloadAttribute payloadAtt;
+
+ PoolingPayloadTokenStream(ByteArrayPool pool) {
+ this.pool = pool;
+ payload = pool.get();
+ generateRandomData(payload);
+ try {
+ term = new String(payload, 0, payload.length, "UTF-8");
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ first = true;
+ payloadAtt = addAttribute(PayloadAttribute.class);
+ termAtt = addAttribute(CharTermAttribute.class);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (!first) return false;
+ first = false;
+ clearAttributes();
+ termAtt.append(term);
+ payloadAtt.setPayload(new Payload(payload));
+ return true;
+ }
+
+ @Override
+ public void close() throws IOException {
+ pool.release(payload);
+ }
+
+ }
+
+ private static class ByteArrayPool {
+ private List<byte[]> pool;
+
+ ByteArrayPool(int capacity, int size) {
+ pool = new ArrayList<byte[]>();
+ for (int i = 0; i < capacity; i++) {
+ pool.add(new byte[size]);
+ }
+ }
+
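+ // remove(0) fails if more buffers are checked out than the pool's capacity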
+ synchronized byte[] get() {
+ return pool.remove(0);
+ }
+
+ synchronized void release(byte[] b) {
+ pool.add(b);
+ }
+
+ synchronized int size() {
+ return pool.size();
+ }
+ }
+
+ public void testAcrossFields() throws Exception {
+ Directory dir = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random, dir,
+ new MockAnalyzer(random, MockTokenizer.WHITESPACE, true));
+ Document doc = new Document();
+ doc.add(new Field("hasMaybepayload", "here we go", Field.Store.YES, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ writer.close();
+
+ writer = new RandomIndexWriter(random, dir,
+ new MockAnalyzer(random, MockTokenizer.WHITESPACE, true));
+ doc = new Document();
+ doc.add(new Field("hasMaybepayload2", "here we go", Field.Store.YES, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ writer.addDocument(doc);
+ writer.optimize();
+ writer.close();
+
+ dir.close();
+ }
+}