lucene-java-3.4.0/lucene/backwards/src/test/org/apache/lucene/index/TestPayloads.java

   1 package org.apache.lucene.index;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.File;
  21 import java.io.IOException;
  22 import java.io.Reader;
  23 import java.io.UnsupportedEncodingException;
  24 import java.util.ArrayList;
  25 import java.util.HashMap;
  26 import java.util.List;
  27 import java.util.Map;
  28
  29 import org.apache.lucene.analysis.Analyzer;
  30 import org.apache.lucene.analysis.MockAnalyzer;
  31 import org.apache.lucene.analysis.MockTokenizer;
  32 import org.apache.lucene.analysis.TokenFilter;
  33 import org.apache.lucene.analysis.TokenStream;
  34 import org.apache.lucene.analysis.WhitespaceTokenizer;
  35 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
  36 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  37 import org.apache.lucene.document.Document;
  38 import org.apache.lucene.document.Field;
  39 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
  40 import org.apache.lucene.store.Directory;
  41 import org.apache.lucene.util.LuceneTestCase;
  42 import org.apache.lucene.util.UnicodeUtil;
  43 import org.apache.lucene.util._TestUtil;
  44
  45
  46 public class TestPayloads extends LuceneTestCase {
  47
  48     // Simple tests to test the Payload class
  49     public void testPayload() throws Exception {
  50         byte[] testData = "This is a test!".getBytes();
  51         Payload payload = new Payload(testData);
  52         assertEquals("Wrong payload length.", testData.length, payload.length());
  53
  54         // test copyTo()
  55         byte[] target = new byte[testData.length - 1];
  56         try {
  57             payload.copyTo(target, 0);
  58             fail("Expected exception not thrown");
  59         } catch (Exception expected) {
  60             // expected exception
  61         }
  62
  63         target = new byte[testData.length + 3];
  64         payload.copyTo(target, 3);
  65
  66         for (int i = 0; i < testData.length; i++) {
  67             assertEquals(testData[i], target[i + 3]);
  68         }
  69
  70
  71         // test toByteArray()
  72         target = payload.toByteArray();
  73         assertByteArrayEquals(testData, target);
  74
  75         // test byteAt()
  76         for (int i = 0; i < testData.length; i++) {
  77             assertEquals(payload.byteAt(i), testData[i]);
  78         }
  79
  80         try {
  81             payload.byteAt(testData.length + 1);
  82             fail("Expected exception not thrown");
  83         } catch (Exception expected) {
  84             // expected exception
  85         }
  86
  87         Payload clone = (Payload) payload.clone();
  88         assertEquals(payload.length(), clone.length());
  89         for (int i = 0; i < payload.length(); i++) {
  90           assertEquals(payload.byteAt(i), clone.byteAt(i));
  91         }
  92
  93     }
  94
  95     // Tests whether the DocumentWriter and SegmentMerger correctly enable the
  96     // payload bit in the FieldInfo
  97     public void testPayloadFieldBit() throws Exception {
  98         Directory ram = newDirectory();
  99         PayloadAnalyzer analyzer = new PayloadAnalyzer();
 100         IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig( TEST_VERSION_CURRENT, analyzer));
 101         Document d = new Document();
 102         // this field won't have any payloads
 103         d.add(newField("f1", "This field has no payloads", Field.Store.NO, Field.Index.ANALYZED));
 104         // this field will have payloads in all docs, however not for all term positions,
 105         // so this field is used to check if the DocumentWriter correctly enables the payloads bit
 106         // even if only some term positions have payloads
 107         d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
 108         d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
 109         // this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads
 110         // enabled in only some documents
 111         d.add(newField("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED));
 112         // only add payload data for field f2
 113         analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1);
 114         writer.addDocument(d);
 115         // flush
 116         writer.close();
 117
 118         SegmentReader reader = SegmentReader.getOnlySegmentReader(ram);
 119         FieldInfos fi = reader.fieldInfos();
 120         assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
 121         assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
 122         assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads);
 123         reader.close();
 124
 125         // now we add another document which has payloads for field f3 and verify if the SegmentMerger
 126         // enabled payloads for that field
 127         writer = new IndexWriter(ram, newIndexWriterConfig( TEST_VERSION_CURRENT,
 128             analyzer).setOpenMode(OpenMode.CREATE));
 129         d = new Document();
 130         d.add(newField("f1", "This field has no payloads", Field.Store.NO, Field.Index.ANALYZED));
 131         d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
 132         d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
 133         d.add(newField("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED));
 134         // add payload data for field f2 and f3
 135         analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1);
 136         analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3);
 137         writer.addDocument(d);
 138         // force merge
 139         writer.optimize();
 140         // flush
 141         writer.close();
 142
 143         reader = SegmentReader.getOnlySegmentReader(ram);
 144         fi = reader.fieldInfos();
 145         assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
 146         assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
 147         assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads);
 148         reader.close();
 149         ram.close();
 150     }
 151
 152     // Tests if payloads are correctly stored and loaded using both RamDirectory and FSDirectory
 153     public void testPayloadsEncoding() throws Exception {
 154         // first perform the test using a RAMDirectory
 155         Directory dir = newDirectory();
 156         performTest(dir);
 157         dir.close();
 158         // now use a FSDirectory and repeat same test
 159         File dirName = _TestUtil.getTempDir("test_payloads");
 160         dir = newFSDirectory(dirName);
 161         performTest(dir);
 162        _TestUtil.rmDir(dirName);
 163         dir.close();
 164     }
 165
 166     // builds an index with payloads in the given Directory and performs
 167     // different tests to verify the payload encoding
 168     private void performTest(Directory dir) throws Exception {
 169         PayloadAnalyzer analyzer = new PayloadAnalyzer();
 170         IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
 171             TEST_VERSION_CURRENT, analyzer)
 172             .setOpenMode(OpenMode.CREATE)
 173             .setMergePolicy(newLogMergePolicy()));
 174
 175         // should be in sync with value in TermInfosWriter
 176         final int skipInterval = 16;
 177
 178         final int numTerms = 5;
 179         final String fieldName = "f1";
 180
 181         int numDocs = skipInterval + 1;
 182         // create content for the test documents with just a few terms
 183         Term[] terms = generateTerms(fieldName, numTerms);
 184         StringBuilder sb = new StringBuilder();
 185         for (int i = 0; i < terms.length; i++) {
 186             sb.append(terms[i].text);
 187             sb.append(" ");
 188         }
 189         String content = sb.toString();
 190
 191
 192         int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
 193         byte[] payloadData = generateRandomData(payloadDataLength);
 194
 195         Document d = new Document();
 196         d.add(newField(fieldName, content, Field.Store.NO, Field.Index.ANALYZED));
 197         // add the same document multiple times to have the same payload lengths for all
 198         // occurrences within two consecutive skip intervals
 199         int offset = 0;
 200         for (int i = 0; i < 2 * numDocs; i++) {
 201             analyzer.setPayloadData(fieldName, payloadData, offset, 1);
 202             offset += numTerms;
 203             writer.addDocument(d);
 204         }
 205
 206         // make sure we create more than one segment to test merging
 207         writer.commit();
 208
 209         // now we make sure to have different payload lengths next at the next skip point
 210         for (int i = 0; i < numDocs; i++) {
 211             analyzer.setPayloadData(fieldName, payloadData, offset, i);
 212             offset += i * numTerms;
 213             writer.addDocument(d);
 214         }
 215
 216         writer.optimize();
 217         // flush
 218         writer.close();
 219
 220
 221         /*
 222          * Verify the index
 223          * first we test if all payloads are stored correctly
 224          */
 225         IndexReader reader = IndexReader.open(dir, true);
 226
 227         byte[] verifyPayloadData = new byte[payloadDataLength];
 228         offset = 0;
 229         TermPositions[] tps = new TermPositions[numTerms];
 230         for (int i = 0; i < numTerms; i++) {
 231             tps[i] = reader.termPositions(terms[i]);
 232         }
 233
 234         while (tps[0].next()) {
 235             for (int i = 1; i < numTerms; i++) {
 236                 tps[i].next();
 237             }
 238             int freq = tps[0].freq();
 239
 240             for (int i = 0; i < freq; i++) {
 241                 for (int j = 0; j < numTerms; j++) {
 242                     tps[j].nextPosition();
 243                     if (tps[j].isPayloadAvailable()) {
 244                       tps[j].getPayload(verifyPayloadData, offset);
 245                       offset += tps[j].getPayloadLength();
 246                     }
 247                 }
 248             }
 249         }
 250
 251         for (int i = 0; i < numTerms; i++) {
 252             tps[i].close();
 253         }
 254
 255         assertByteArrayEquals(payloadData, verifyPayloadData);
 256
 257         /*
 258          *  test lazy skipping
 259          */
 260         TermPositions tp = reader.termPositions(terms[0]);
 261         tp.next();
 262         tp.nextPosition();
 263         // now we don't read this payload
 264         tp.nextPosition();
 265         assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
 266         byte[] payload = tp.getPayload(null, 0);
 267         assertEquals(payload[0], payloadData[numTerms]);
 268         tp.nextPosition();
 269
 270         // we don't read this payload and skip to a different document
 271         tp.skipTo(5);
 272         tp.nextPosition();
 273         assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
 274         payload = tp.getPayload(null, 0);
 275         assertEquals(payload[0], payloadData[5 * numTerms]);
 276
 277
 278         /*
 279          * Test different lengths at skip points
 280          */
 281         tp.seek(terms[1]);
 282         tp.next();
 283         tp.nextPosition();
 284         assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
 285         tp.skipTo(skipInterval - 1);
 286         tp.nextPosition();
 287         assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
 288         tp.skipTo(2 * skipInterval - 1);
 289         tp.nextPosition();
 290         assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
 291         tp.skipTo(3 * skipInterval - 1);
 292         tp.nextPosition();
 293         assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength());
 294
 295         /*
 296          * Test multiple call of getPayload()
 297          */
 298         tp.getPayload(null, 0);
 299         try {
 300             // it is forbidden to call getPayload() more than once
 301             // without calling nextPosition()
 302             tp.getPayload(null, 0);
 303             fail("Expected exception not thrown");
 304         } catch (Exception expected) {
 305             // expected exception
 306         }
 307
 308         reader.close();
 309
 310         // test long payload
 311         analyzer = new PayloadAnalyzer();
 312         writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT,
 313             analyzer).setOpenMode(OpenMode.CREATE));
 314         String singleTerm = "lucene";
 315
 316         d = new Document();
 317         d.add(newField(fieldName, singleTerm, Field.Store.NO, Field.Index.ANALYZED));
 318         // add a payload whose length is greater than the buffer size of BufferedIndexOutput
 319         payloadData = generateRandomData(2000);
 320         analyzer.setPayloadData(fieldName, payloadData, 100, 1500);
 321         writer.addDocument(d);
 322
 323
 324         writer.optimize();
 325         // flush
 326         writer.close();
 327
 328         reader = IndexReader.open(dir, true);
 329         tp = reader.termPositions(new Term(fieldName, singleTerm));
 330         tp.next();
 331         tp.nextPosition();
 332
 333         verifyPayloadData = new byte[tp.getPayloadLength()];
 334         tp.getPayload(verifyPayloadData, 0);
 335         byte[] portion = new byte[1500];
 336         System.arraycopy(payloadData, 100, portion, 0, 1500);
 337
 338         assertByteArrayEquals(portion, verifyPayloadData);
 339         reader.close();
 340
 341     }
 342
 343     private void generateRandomData(byte[] data) {
 344         random.nextBytes(data);
 345     }
 346
 347     private byte[] generateRandomData(int n) {
 348         byte[] data = new byte[n];
 349         generateRandomData(data);
 350         return data;
 351     }
 352
 353     private Term[] generateTerms(String fieldName, int n) {
 354         int maxDigits = (int) (Math.log(n) / Math.log(10));
 355         Term[] terms = new Term[n];
 356         StringBuilder sb = new StringBuilder();
 357         for (int i = 0; i < n; i++) {
 358             sb.setLength(0);
 359             sb.append("t");
 360             int zeros = maxDigits - (int) (Math.log(i) / Math.log(10));
 361             for (int j = 0; j < zeros; j++) {
 362                 sb.append("0");
 363             }
 364             sb.append(i);
 365             terms[i] = new Term(fieldName, sb.toString());
 366         }
 367         return terms;
 368     }
 369
 370
 371     void assertByteArrayEquals(byte[] b1, byte[] b2) {
 372         if (b1.length != b2.length) {
 373           fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length);
 374         }
 375
 376         for (int i = 0; i < b1.length; i++) {
 377           if (b1[i] != b2[i]) {
 378             fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]);
 379           }
 380         }
 381       }
 382
 383
 384     /**
 385      * This Analyzer uses an WhitespaceTokenizer and PayloadFilter.
 386      */
 387     private static class PayloadAnalyzer extends Analyzer {
 388         Map<String,PayloadData> fieldToData = new HashMap<String,PayloadData>();
 389
 390         void setPayloadData(String field, byte[] data, int offset, int length) {
 391             fieldToData.put(field, new PayloadData(0, data, offset, length));
 392         }
 393
 394         void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) {
 395             fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length));
 396         }
 397
 398         @Override
 399         public TokenStream tokenStream(String fieldName, Reader reader) {
 400             PayloadData payload =  fieldToData.get(fieldName);
 401             TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
 402             if (payload != null) {
 403                 if (payload.numFieldInstancesToSkip == 0) {
 404                     ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length);
 405                 } else {
 406                     payload.numFieldInstancesToSkip--;
 407                 }
 408             }
 409             return ts;
 410         }
 411
 412         private static class PayloadData {
 413             byte[] data;
 414             int offset;
 415             int length;
 416             int numFieldInstancesToSkip;
 417
 418             PayloadData(int skip, byte[] data, int offset, int length) {
 419                 numFieldInstancesToSkip = skip;
 420                 this.data = data;
 421                 this.offset = offset;
 422                 this.length = length;
 423             }
 424         }
 425     }
 426
 427
 428     /**
 429      * This Filter adds payloads to the tokens.
 430      */
 431     private static class PayloadFilter extends TokenFilter {
 432         private byte[] data;
 433         private int length;
 434         private int offset;
 435         Payload payload = new Payload();
 436         PayloadAttribute payloadAtt;
 437
 438         public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
 439             super(in);
 440             this.data = data;
 441             this.length = length;
 442             this.offset = offset;
 443             payloadAtt = addAttribute(PayloadAttribute.class);
 444         }
 445
 446         @Override
 447         public boolean incrementToken() throws IOException {
 448             boolean hasNext = input.incrementToken();
 449             if (hasNext) {
 450                 if (offset + length <= data.length) {
 451                     Payload p = new Payload();
 452                     payloadAtt.setPayload(p);
 453                     p.setData(data, offset, length);
 454                     offset += length;
 455                 } else {
 456                     payloadAtt.setPayload(null);
 457                 }
 458             }
 459
 460             return hasNext;
 461         }
 462     }
 463
 464     public void testThreadSafety() throws Exception {
 465         final int numThreads = 5;
 466         final int numDocs = atLeast(50);
 467         final ByteArrayPool pool = new ByteArrayPool(numThreads, 5);
 468
 469         Directory dir = newDirectory();
 470         final IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
 471             TEST_VERSION_CURRENT, new MockAnalyzer(random)));
 472         final String field = "test";
 473
 474         Thread[] ingesters = new Thread[numThreads];
 475         for (int i = 0; i < numThreads; i++) {
 476             ingesters[i] = new Thread() {
 477                 @Override
 478                 public void run() {
 479                     try {
 480                         for (int j = 0; j < numDocs; j++) {
 481                             Document d = new Document();
 482                             d.add(new Field(field, new PoolingPayloadTokenStream(pool)));
 483                             writer.addDocument(d);
 484                         }
 485                     } catch (Exception e) {
 486                         e.printStackTrace();
 487                         fail(e.toString());
 488                     }
 489                 }
 490             };
 491             ingesters[i].start();
 492         }
 493
 494         for (int i = 0; i < numThreads; i++) {
 495           ingesters[i].join();
 496         }
 497         writer.close();
 498         IndexReader reader = IndexReader.open(dir, true);
 499         TermEnum terms = reader.terms();
 500         while (terms.next()) {
 501             TermPositions tp = reader.termPositions(terms.term());
 502             while(tp.next()) {
 503                 int freq = tp.freq();
 504                 for (int i = 0; i < freq; i++) {
 505                     tp.nextPosition();
 506                     assertEquals(pool.bytesToString(tp.getPayload(new byte[5], 0)), terms.term().text);
 507                 }
 508             }
 509             tp.close();
 510         }
 511         terms.close();
 512         reader.close();
 513         dir.close();
 514         assertEquals(pool.size(), numThreads);
 515     }
 516
 517     private class PoolingPayloadTokenStream extends TokenStream {
 518         private byte[] payload;
 519         private boolean first;
 520         private ByteArrayPool pool;
 521         private String term;
 522
 523         CharTermAttribute termAtt;
 524         PayloadAttribute payloadAtt;
 525
 526         PoolingPayloadTokenStream(ByteArrayPool pool) {
 527             this.pool = pool;
 528             payload = pool.get();
 529             generateRandomData(payload);
 530             term = pool.bytesToString(payload);
 531             first = true;
 532             payloadAtt = addAttribute(PayloadAttribute.class);
 533             termAtt = addAttribute(CharTermAttribute.class);
 534         }
 535
 536         @Override
 537         public boolean incrementToken() throws IOException {
 538             if (!first) return false;
 539             first = false;
 540             clearAttributes();
 541             termAtt.append(term);
 542             payloadAtt.setPayload(new Payload(payload));
 543             return true;
 544         }
 545
 546         @Override
 547         public void close() throws IOException {
 548             pool.release(payload);
 549         }
 550
 551     }
 552
 553     private static class ByteArrayPool {
 554         private List<byte[]> pool;
 555
 556         ByteArrayPool(int capacity, int size) {
 557             pool = new ArrayList<byte[]>();
 558             for (int i = 0; i < capacity; i++) {
 559                 pool.add(new byte[size]);
 560             }
 561         }
 562
 563         private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result();
 564
 565         synchronized String bytesToString(byte[] bytes) {
 566             String s = new String(bytes);
 567             UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result);
 568             try {
 569                 return new String(utf8Result.result, 0, utf8Result.length, "UTF-8");
 570             } catch (UnsupportedEncodingException uee) {
 571                 return null;
 572             }
 573         }
 574
 575         synchronized byte[] get() {
 576             return pool.remove(0);
 577         }
 578
 579         synchronized void release(byte[] b) {
 580             pool.add(b);
 581         }
 582
 583         synchronized int size() {
 584             return pool.size();
 585         }
 586     }
 587
 588   public void testAcrossFields() throws Exception {
 589     Directory dir = newDirectory();
 590     RandomIndexWriter writer = new RandomIndexWriter(random, dir,
 591                                                      new MockAnalyzer(random, MockTokenizer.WHITESPACE, true));
 592     Document doc = new Document();
 593     doc.add(new Field("hasMaybepayload", "here we go", Field.Store.YES, Field.Index.ANALYZED));
 594     writer.addDocument(doc);
 595     writer.close();
 596
 597     writer = new RandomIndexWriter(random, dir,
 598                                    new MockAnalyzer(random, MockTokenizer.WHITESPACE, true));
 599     doc = new Document();
 600     doc.add(new Field("hasMaybepayload2", "here we go", Field.Store.YES, Field.Index.ANALYZED));
 601     writer.addDocument(doc);
 602     writer.addDocument(doc);
 603     writer.optimize();
 604     writer.close();
 605
 606     dir.close();
 607   }
 608 }