lucene-java-3.4.0/lucene/src/test/org/apache/lucene/index/TestPayloads.java

   1 package org.apache.lucene.index;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.File;
  21 import java.io.IOException;
  22 import java.io.Reader;
  23 import java.io.UnsupportedEncodingException;
  24 import java.util.ArrayList;
  25 import java.util.HashMap;
  26 import java.util.List;
  27 import java.util.Map;
  28
  29 import org.apache.lucene.analysis.Analyzer;
  30 import org.apache.lucene.analysis.MockAnalyzer;
  31 import org.apache.lucene.analysis.MockTokenizer;
  32 import org.apache.lucene.analysis.TokenFilter;
  33 import org.apache.lucene.analysis.TokenStream;
  34 import org.apache.lucene.analysis.WhitespaceTokenizer;
  35 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
  36 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  37 import org.apache.lucene.document.Document;
  38 import org.apache.lucene.document.Field;
  39 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
  40 import org.apache.lucene.store.Directory;
  41 import org.apache.lucene.util.LuceneTestCase;
  42 import org.apache.lucene.util._TestUtil;
  43
  44
  45 public class TestPayloads extends LuceneTestCase {
  46
  47     // Simple tests to test the Payload class
  48     public void testPayload() throws Exception {
  49         byte[] testData = "This is a test!".getBytes();
  50         Payload payload = new Payload(testData);
  51         assertEquals("Wrong payload length.", testData.length, payload.length());
  52
  53         // test copyTo()
  54         byte[] target = new byte[testData.length - 1];
  55         try {
  56             payload.copyTo(target, 0);
  57             fail("Expected exception not thrown");
  58         } catch (Exception expected) {
  59             // expected exception
  60         }
  61
  62         target = new byte[testData.length + 3];
  63         payload.copyTo(target, 3);
  64
  65         for (int i = 0; i < testData.length; i++) {
  66             assertEquals(testData[i], target[i + 3]);
  67         }
  68
  69
  70         // test toByteArray()
  71         target = payload.toByteArray();
  72         assertByteArrayEquals(testData, target);
  73
  74         // test byteAt()
  75         for (int i = 0; i < testData.length; i++) {
  76             assertEquals(payload.byteAt(i), testData[i]);
  77         }
  78
  79         try {
  80             payload.byteAt(testData.length + 1);
  81             fail("Expected exception not thrown");
  82         } catch (Exception expected) {
  83             // expected exception
  84         }
  85
  86         Payload clone = (Payload) payload.clone();
  87         assertEquals(payload.length(), clone.length());
  88         for (int i = 0; i < payload.length(); i++) {
  89           assertEquals(payload.byteAt(i), clone.byteAt(i));
  90         }
  91
  92     }
  93
  94     // Tests whether the DocumentWriter and SegmentMerger correctly enable the
  95     // payload bit in the FieldInfo
  96     public void testPayloadFieldBit() throws Exception {
  97         Directory ram = newDirectory();
  98         PayloadAnalyzer analyzer = new PayloadAnalyzer();
  99         IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig( TEST_VERSION_CURRENT, analyzer));
 100         Document d = new Document();
 101         // this field won't have any payloads
 102         d.add(newField("f1", "This field has no payloads", Field.Store.NO, Field.Index.ANALYZED));
 103         // this field will have payloads in all docs, however not for all term positions,
 104         // so this field is used to check if the DocumentWriter correctly enables the payloads bit
 105         // even if only some term positions have payloads
 106         d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
 107         d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
 108         // this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads
 109         // enabled in only some documents
 110         d.add(newField("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED));
 111         // only add payload data for field f2
 112         analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1);
 113         writer.addDocument(d);
 114         // flush
 115         writer.close();
 116
 117         SegmentReader reader = SegmentReader.getOnlySegmentReader(ram);
 118         FieldInfos fi = reader.fieldInfos();
 119         assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
 120         assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
 121         assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads);
 122         reader.close();
 123
 124         // now we add another document which has payloads for field f3 and verify if the SegmentMerger
 125         // enabled payloads for that field
 126         writer = new IndexWriter(ram, newIndexWriterConfig( TEST_VERSION_CURRENT,
 127             analyzer).setOpenMode(OpenMode.CREATE));
 128         d = new Document();
 129         d.add(newField("f1", "This field has no payloads", Field.Store.NO, Field.Index.ANALYZED));
 130         d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
 131         d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
 132         d.add(newField("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED));
 133         // add payload data for field f2 and f3
 134         analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1);
 135         analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3);
 136         writer.addDocument(d);
 137         // force merge
 138         writer.optimize();
 139         // flush
 140         writer.close();
 141
 142         reader = SegmentReader.getOnlySegmentReader(ram);
 143         fi = reader.fieldInfos();
 144         assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
 145         assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
 146         assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads);
 147         reader.close();
 148         ram.close();
 149     }
 150
 151     // Tests if payloads are correctly stored and loaded using both RamDirectory and FSDirectory
 152     public void testPayloadsEncoding() throws Exception {
 153         // first perform the test using a RAMDirectory
 154         Directory dir = newDirectory();
 155         performTest(dir);
 156         dir.close();
 157         // now use a FSDirectory and repeat same test
 158         File dirName = _TestUtil.getTempDir("test_payloads");
 159         dir = newFSDirectory(dirName);
 160         performTest(dir);
 161        _TestUtil.rmDir(dirName);
 162         dir.close();
 163     }
 164
 165     // builds an index with payloads in the given Directory and performs
 166     // different tests to verify the payload encoding
 167     private void performTest(Directory dir) throws Exception {
 168         PayloadAnalyzer analyzer = new PayloadAnalyzer();
 169         IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
 170             TEST_VERSION_CURRENT, analyzer)
 171             .setOpenMode(OpenMode.CREATE)
 172             .setMergePolicy(newLogMergePolicy()));
 173
 174         // should be in sync with value in TermInfosWriter
 175         final int skipInterval = 16;
 176
 177         final int numTerms = 5;
 178         final String fieldName = "f1";
 179
 180         int numDocs = skipInterval + 1;
 181         // create content for the test documents with just a few terms
 182         Term[] terms = generateTerms(fieldName, numTerms);
 183         StringBuilder sb = new StringBuilder();
 184         for (int i = 0; i < terms.length; i++) {
 185             sb.append(terms[i].text);
 186             sb.append(" ");
 187         }
 188         String content = sb.toString();
 189
 190
 191         int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
 192         byte[] payloadData = generateRandomData(payloadDataLength);
 193
 194         Document d = new Document();
 195         d.add(newField(fieldName, content, Field.Store.NO, Field.Index.ANALYZED));
 196         // add the same document multiple times to have the same payload lengths for all
 197         // occurrences within two consecutive skip intervals
 198         int offset = 0;
 199         for (int i = 0; i < 2 * numDocs; i++) {
 200             analyzer.setPayloadData(fieldName, payloadData, offset, 1);
 201             offset += numTerms;
 202             writer.addDocument(d);
 203         }
 204
 205         // make sure we create more than one segment to test merging
 206         writer.commit();
 207
 208         // now we make sure to have different payload lengths next at the next skip point
 209         for (int i = 0; i < numDocs; i++) {
 210             analyzer.setPayloadData(fieldName, payloadData, offset, i);
 211             offset += i * numTerms;
 212             writer.addDocument(d);
 213         }
 214
 215         writer.optimize();
 216         // flush
 217         writer.close();
 218
 219
 220         /*
 221          * Verify the index
 222          * first we test if all payloads are stored correctly
 223          */
 224         IndexReader reader = IndexReader.open(dir, true);
 225
 226         byte[] verifyPayloadData = new byte[payloadDataLength];
 227         offset = 0;
 228         TermPositions[] tps = new TermPositions[numTerms];
 229         for (int i = 0; i < numTerms; i++) {
 230             tps[i] = reader.termPositions(terms[i]);
 231         }
 232
 233         while (tps[0].next()) {
 234             for (int i = 1; i < numTerms; i++) {
 235                 tps[i].next();
 236             }
 237             int freq = tps[0].freq();
 238
 239             for (int i = 0; i < freq; i++) {
 240                 for (int j = 0; j < numTerms; j++) {
 241                     tps[j].nextPosition();
 242                     if (tps[j].isPayloadAvailable()) {
 243                       tps[j].getPayload(verifyPayloadData, offset);
 244                       offset += tps[j].getPayloadLength();
 245                     }
 246                 }
 247             }
 248         }
 249
 250         for (int i = 0; i < numTerms; i++) {
 251             tps[i].close();
 252         }
 253
 254         assertByteArrayEquals(payloadData, verifyPayloadData);
 255
 256         /*
 257          *  test lazy skipping
 258          */
 259         TermPositions tp = reader.termPositions(terms[0]);
 260         tp.next();
 261         tp.nextPosition();
 262         // now we don't read this payload
 263         tp.nextPosition();
 264         assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
 265         byte[] payload = tp.getPayload(null, 0);
 266         assertEquals(payload[0], payloadData[numTerms]);
 267         tp.nextPosition();
 268
 269         // we don't read this payload and skip to a different document
 270         tp.skipTo(5);
 271         tp.nextPosition();
 272         assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
 273         payload = tp.getPayload(null, 0);
 274         assertEquals(payload[0], payloadData[5 * numTerms]);
 275
 276
 277         /*
 278          * Test different lengths at skip points
 279          */
 280         tp.seek(terms[1]);
 281         tp.next();
 282         tp.nextPosition();
 283         assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
 284         tp.skipTo(skipInterval - 1);
 285         tp.nextPosition();
 286         assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
 287         tp.skipTo(2 * skipInterval - 1);
 288         tp.nextPosition();
 289         assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
 290         tp.skipTo(3 * skipInterval - 1);
 291         tp.nextPosition();
 292         assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength());
 293
 294         /*
 295          * Test multiple call of getPayload()
 296          */
 297         tp.getPayload(null, 0);
 298         try {
 299             // it is forbidden to call getPayload() more than once
 300             // without calling nextPosition()
 301             tp.getPayload(null, 0);
 302             fail("Expected exception not thrown");
 303         } catch (Exception expected) {
 304             // expected exception
 305         }
 306
 307         reader.close();
 308
 309         // test long payload
 310         analyzer = new PayloadAnalyzer();
 311         writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT,
 312             analyzer).setOpenMode(OpenMode.CREATE));
 313         String singleTerm = "lucene";
 314
 315         d = new Document();
 316         d.add(newField(fieldName, singleTerm, Field.Store.NO, Field.Index.ANALYZED));
 317         // add a payload whose length is greater than the buffer size of BufferedIndexOutput
 318         payloadData = generateRandomData(2000);
 319         analyzer.setPayloadData(fieldName, payloadData, 100, 1500);
 320         writer.addDocument(d);
 321
 322
 323         writer.optimize();
 324         // flush
 325         writer.close();
 326
 327         reader = IndexReader.open(dir, true);
 328         tp = reader.termPositions(new Term(fieldName, singleTerm));
 329         tp.next();
 330         tp.nextPosition();
 331
 332         verifyPayloadData = new byte[tp.getPayloadLength()];
 333         tp.getPayload(verifyPayloadData, 0);
 334         byte[] portion = new byte[1500];
 335         System.arraycopy(payloadData, 100, portion, 0, 1500);
 336
 337         assertByteArrayEquals(portion, verifyPayloadData);
 338         reader.close();
 339
 340     }
 341
 342     private void generateRandomData(byte[] data) {
 343       // this test needs the random data to be valid unicode
 344       String s = _TestUtil.randomFixedByteLengthUnicodeString(random, data.length);
 345       byte b[];
 346       try {
 347         b = s.getBytes("UTF-8");
 348       } catch (UnsupportedEncodingException e) {
 349         throw new RuntimeException(e);
 350       }
 351       assert b.length == data.length;
 352       System.arraycopy(b, 0, data, 0, b.length);
 353     }
 354
 355     private byte[] generateRandomData(int n) {
 356         byte[] data = new byte[n];
 357         generateRandomData(data);
 358         return data;
 359     }
 360
 361     private Term[] generateTerms(String fieldName, int n) {
 362         int maxDigits = (int) (Math.log(n) / Math.log(10));
 363         Term[] terms = new Term[n];
 364         StringBuilder sb = new StringBuilder();
 365         for (int i = 0; i < n; i++) {
 366             sb.setLength(0);
 367             sb.append("t");
 368             int zeros = maxDigits - (int) (Math.log(i) / Math.log(10));
 369             for (int j = 0; j < zeros; j++) {
 370                 sb.append("0");
 371             }
 372             sb.append(i);
 373             terms[i] = new Term(fieldName, sb.toString());
 374         }
 375         return terms;
 376     }
 377
 378
 379     void assertByteArrayEquals(byte[] b1, byte[] b2) {
 380         if (b1.length != b2.length) {
 381           fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length);
 382         }
 383
 384         for (int i = 0; i < b1.length; i++) {
 385           if (b1[i] != b2[i]) {
 386             fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]);
 387           }
 388         }
 389       }
 390
 391
 392     /**
 393      * This Analyzer uses an WhitespaceTokenizer and PayloadFilter.
 394      */
 395     private static class PayloadAnalyzer extends Analyzer {
 396         Map<String,PayloadData> fieldToData = new HashMap<String,PayloadData>();
 397
 398         void setPayloadData(String field, byte[] data, int offset, int length) {
 399             fieldToData.put(field, new PayloadData(0, data, offset, length));
 400         }
 401
 402         void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) {
 403             fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length));
 404         }
 405
 406         @Override
 407         public TokenStream tokenStream(String fieldName, Reader reader) {
 408             PayloadData payload =  fieldToData.get(fieldName);
 409             TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
 410             if (payload != null) {
 411                 if (payload.numFieldInstancesToSkip == 0) {
 412                     ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length);
 413                 } else {
 414                     payload.numFieldInstancesToSkip--;
 415                 }
 416             }
 417             return ts;
 418         }
 419
 420         private static class PayloadData {
 421             byte[] data;
 422             int offset;
 423             int length;
 424             int numFieldInstancesToSkip;
 425
 426             PayloadData(int skip, byte[] data, int offset, int length) {
 427                 numFieldInstancesToSkip = skip;
 428                 this.data = data;
 429                 this.offset = offset;
 430                 this.length = length;
 431             }
 432         }
 433     }
 434
 435
 436     /**
 437      * This Filter adds payloads to the tokens.
 438      */
 439     private static class PayloadFilter extends TokenFilter {
 440         private byte[] data;
 441         private int length;
 442         private int offset;
 443         private int startOffset;
 444         PayloadAttribute payloadAtt;
 445
 446         public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
 447             super(in);
 448             this.data = data;
 449             this.length = length;
 450             this.offset = offset;
 451             this.startOffset = offset;
 452             payloadAtt = addAttribute(PayloadAttribute.class);
 453         }
 454
 455         @Override
 456         public boolean incrementToken() throws IOException {
 457             boolean hasNext = input.incrementToken();
 458             if (hasNext) {
 459                 if (offset + length <= data.length) {
 460                     Payload p = new Payload();
 461                     payloadAtt.setPayload(p);
 462                     p.setData(data, offset, length);
 463                     offset += length;
 464                 } else {
 465                     payloadAtt.setPayload(null);
 466                 }
 467             }
 468
 469             return hasNext;
 470         }
 471
 472       @Override
 473       public void reset() throws IOException {
 474         super.reset();
 475         this.offset = startOffset;
 476       }
 477     }
 478
 479     public void testThreadSafety() throws Exception {
 480         final int numThreads = 5;
 481         final int numDocs = atLeast(50);
 482         final ByteArrayPool pool = new ByteArrayPool(numThreads, 5);
 483
 484         Directory dir = newDirectory();
 485         final IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
 486             TEST_VERSION_CURRENT, new MockAnalyzer(random)));
 487         final String field = "test";
 488
 489         Thread[] ingesters = new Thread[numThreads];
 490         for (int i = 0; i < numThreads; i++) {
 491             ingesters[i] = new Thread() {
 492                 @Override
 493                 public void run() {
 494                     try {
 495                         for (int j = 0; j < numDocs; j++) {
 496                             Document d = new Document();
 497                             d.add(new Field(field, new PoolingPayloadTokenStream(pool)));
 498                             writer.addDocument(d);
 499                         }
 500                     } catch (Exception e) {
 501                         e.printStackTrace();
 502                         fail(e.toString());
 503                     }
 504                 }
 505             };
 506             ingesters[i].start();
 507         }
 508
 509         for (int i = 0; i < numThreads; i++) {
 510           ingesters[i].join();
 511         }
 512         writer.close();
 513         IndexReader reader = IndexReader.open(dir, true);
 514         TermEnum terms = reader.terms();
 515         while (terms.next()) {
 516             TermPositions tp = reader.termPositions(terms.term());
 517             while(tp.next()) {
 518                 int freq = tp.freq();
 519                 for (int i = 0; i < freq; i++) {
 520                     tp.nextPosition();
 521                     byte payload[] = new byte[5];
 522                     tp.getPayload(payload, 0);
 523                     assertEquals(terms.term().text, new String(payload, 0, payload.length, "UTF-8"));
 524                 }
 525             }
 526             tp.close();
 527         }
 528         terms.close();
 529         reader.close();
 530         dir.close();
 531         assertEquals(pool.size(), numThreads);
 532     }
 533
 534     private class PoolingPayloadTokenStream extends TokenStream {
 535         private byte[] payload;
 536         private boolean first;
 537         private ByteArrayPool pool;
 538         private String term;
 539
 540         CharTermAttribute termAtt;
 541         PayloadAttribute payloadAtt;
 542
 543         PoolingPayloadTokenStream(ByteArrayPool pool) {
 544             this.pool = pool;
 545             payload = pool.get();
 546             generateRandomData(payload);
 547             try {
 548               term = new String(payload, 0, payload.length, "UTF-8");
 549             } catch (UnsupportedEncodingException e) {
 550               throw new RuntimeException(e);
 551             }
 552             first = true;
 553             payloadAtt = addAttribute(PayloadAttribute.class);
 554             termAtt = addAttribute(CharTermAttribute.class);
 555         }
 556
 557         @Override
 558         public boolean incrementToken() throws IOException {
 559             if (!first) return false;
 560             first = false;
 561             clearAttributes();
 562             termAtt.append(term);
 563             payloadAtt.setPayload(new Payload(payload));
 564             return true;
 565         }
 566
 567         @Override
 568         public void close() throws IOException {
 569             pool.release(payload);
 570         }
 571
 572     }
 573
 574     private static class ByteArrayPool {
 575         private List<byte[]> pool;
 576
 577         ByteArrayPool(int capacity, int size) {
 578             pool = new ArrayList<byte[]>();
 579             for (int i = 0; i < capacity; i++) {
 580                 pool.add(new byte[size]);
 581             }
 582         }
 583
 584         synchronized byte[] get() {
 585             return pool.remove(0);
 586         }
 587
 588         synchronized void release(byte[] b) {
 589             pool.add(b);
 590         }
 591
 592         synchronized int size() {
 593             return pool.size();
 594         }
 595     }
 596
 597   public void testAcrossFields() throws Exception {
 598     Directory dir = newDirectory();
 599     RandomIndexWriter writer = new RandomIndexWriter(random, dir,
 600                                                      new MockAnalyzer(random, MockTokenizer.WHITESPACE, true));
 601     Document doc = new Document();
 602     doc.add(new Field("hasMaybepayload", "here we go", Field.Store.YES, Field.Index.ANALYZED));
 603     writer.addDocument(doc);
 604     writer.close();
 605
 606     writer = new RandomIndexWriter(random, dir,
 607                                    new MockAnalyzer(random, MockTokenizer.WHITESPACE, true));
 608     doc = new Document();
 609     doc.add(new Field("hasMaybepayload2", "here we go", Field.Store.YES, Field.Index.ANALYZED));
 610     writer.addDocument(doc);
 611     writer.addDocument(doc);
 612     writer.optimize();
 613     writer.close();
 614
 615     dir.close();
 616   }
 617 }