lucene-java-3.4.0/lucene/contrib/facet/src/java/org/apache/lucene/facet/util/RandomSample.java

   1 package org.apache.lucene.facet.util;
   2
   3 import java.io.IOException;
   4 import java.util.ArrayList;
   5 import java.util.Arrays;
   6 import java.util.logging.Level;
   7 import java.util.logging.Logger;
   8
   9 import org.apache.lucene.analysis.KeywordAnalyzer;
  10 import org.apache.lucene.document.Document;
  11 import org.apache.lucene.index.CorruptIndexException;
  12 import org.apache.lucene.index.IndexReader;
  13 import org.apache.lucene.index.IndexWriter;
  14 import org.apache.lucene.index.IndexWriterConfig;
  15 import org.apache.lucene.store.Directory;
  16 import org.apache.lucene.store.LockObtainFailedException;
  17 import org.apache.lucene.store.RAMDirectory;
  18 import org.apache.lucene.util.PriorityQueue;
  19 import org.apache.lucene.util.Version;
  20
  21 import org.apache.lucene.facet.search.ScoredDocIDs;
  22 import org.apache.lucene.facet.search.ScoredDocIDsIterator;
  23
  24 /**
  25  * Licensed to the Apache Software Foundation (ASF) under one or more
  26  * contributor license agreements.  See the NOTICE file distributed with
  27  * this work for additional information regarding copyright ownership.
  28  * The ASF licenses this file to You under the Apache License, Version 2.0
  29  * (the "License"); you may not use this file except in compliance with
  30  * the License.  You may obtain a copy of the License at
  31  *
  32  *     http://www.apache.org/licenses/LICENSE-2.0
  33  *
  34  * Unless required by applicable law or agreed to in writing, software
  35  * distributed under the License is distributed on an "AS IS" BASIS,
  36  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  37  * See the License for the specific language governing permissions and
  38  * limitations under the License.
  39  */
  40
  41 /**
  42  * Take random samples of large collections.
  43  *
  44  * @lucene.experimental
  45  */
  46 public class RandomSample {
  47
  48   private static final Logger logger = Logger.getLogger(RandomSample.class.getName());
  49
  50   /**
  51    * Returns <code>sampleSize</code> values from the first <code>collectionSize</code>
  52    * locations of <code>collection</code>, chosen using
  53    * the <code>TRAVERSAL</code> algorithm. The sample values are not sorted.
  54    * @param collection The values from which a sample is wanted.
  55    * @param collectionSize The number of values (from the first) from which to draw the sample.
  56    * @param sampleSize The number of values to return.
  57    * @return An array of values chosen from the collection.
  58    * @see Algorithm#TRAVERSAL
  59    */
  60   public static int[] repeatableSample(ScoredDocIDs collection,
  61       int collectionSize, int sampleSize)
  62   throws IOException {
  63     return RandomSample.repeatableSample(collection, collectionSize,
  64         sampleSize, Algorithm.HASHING, Sorted.NO);
  65   }
  66
  67   /**
  68    * Returns <code>sampleSize</code> values from the first <code>collectionSize</code>
  69    * locations of <code>collection</code>, chosen using <code>algorithm</code>.
  70    * @param collection The values from which a sample is wanted.
  71    * @param collectionSize The number of values (from the first) from which to draw the sample.
  72    * @param sampleSize The number of values to return.
  73    * @param algorithm Which algorithm to use.
  74    * @param sorted Sorted.YES to sort the sample values in ascending order before returning;
  75    * Sorted.NO to return them in essentially random order.
  76    * @return An array of values chosen from the collection.
  77    */
  78   public static int[] repeatableSample(ScoredDocIDs collection,
  79       int collectionSize, int sampleSize,
  80       Algorithm algorithm, Sorted sorted)
  81   throws IOException {
  82     if (collection == null) {
  83       throw new IOException("docIdSet is null");
  84     }
  85     if (sampleSize < 1) {
  86       throw new IOException("sampleSize < 1 (" + sampleSize + ")");
  87     }
  88     if (collectionSize < sampleSize) {
  89       throw new IOException("collectionSize (" + collectionSize + ") less than sampleSize (" + sampleSize + ")");
  90     }
  91     int[] sample = new int[sampleSize];
  92     long[] times = new long[4];
  93     if (algorithm == Algorithm.TRAVERSAL) {
  94       RandomSample.sample1(collection, collectionSize, sample, times);
  95     } else if (algorithm == Algorithm.HASHING) {
  96       RandomSample.sample2(collection, collectionSize, sample, times);
  97     } else {
  98       throw new IllegalArgumentException("Invalid algorithm selection");
  99     }
 100     if (sorted == Sorted.YES) {
 101       Arrays.sort(sample);
 102     }
 103     if (RandomSample.returnTimings) {
 104       times[3] = System.currentTimeMillis();
 105       if (logger.isLoggable(Level.FINEST)) {
 106         logger.finest("Times: " + (times[1] - times[0]) + "ms, "
 107             + (times[2] - times[1]) + "ms, " + (times[3] - times[2])+"ms");
 108       }
 109     }
 110     return sample;
 111   }
 112
 113   /**
 114    * Returns <code>sample</code>.length values chosen from the first <code>collectionSize</code>
 115    * locations of <code>collection</code>, using the TRAVERSAL algorithm. The sample is
 116    * pseudorandom: no subset of the original collection
 117    * is in principle more likely to occur than any other, but for a given collection
 118    * and sample size, the same sample will always be returned. This algorithm walks the
 119    * original collection in a methodical way that is guaranteed not to visit any location
 120    * more than once, which makes sampling without replacement faster because removals don't
 121    * have to be tracked, and the number of operations is proportional to the sample size,
 122    * not the collection size.
 123    * Times for performance measurement
 124    * are returned in <code>times</code>, which must be an array of at least three longs, containing
 125    * nanosecond event times. The first
 126    * is set when the algorithm starts; the second, when the step size has been calculated;
 127    * and the third when the sample has been taken.
 128    * @param collection The set to be sampled.
 129    * @param collectionSize The number of values to use (starting from first).
 130    * @param sample The array in which to return the sample.
 131    * @param times The times of three events, for measuring performance.
 132    */
 133   private static void sample1(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times)
 134   throws IOException {
 135     ScoredDocIDsIterator it = collection.iterator();
 136     if (RandomSample.returnTimings) {
 137       times[0] = System.currentTimeMillis();
 138     }
 139     int sampleSize = sample.length;
 140     int prime = RandomSample.findGoodStepSize(collectionSize, sampleSize);
 141     int mod = prime % collectionSize;
 142     if (RandomSample.returnTimings) {
 143       times[1] = System.currentTimeMillis();
 144     }
 145     int sampleCount = 0;
 146     int index = 0;
 147     for (; sampleCount < sampleSize;) {
 148       if (index + mod < collectionSize) {
 149         for (int i = 0; i < mod; i++, index++) {
 150           it.next();
 151         }
 152       } else {
 153         index = index + mod - collectionSize;
 154         it = collection.iterator();
 155         for (int i = 0; i < index; i++) {
 156           it.next();
 157         }
 158       }
 159       sample[sampleCount++] = it.getDocID();
 160     }
 161     if (RandomSample.returnTimings) {
 162       times[2] = System.currentTimeMillis();
 163     }
 164   } // end RandomSample.sample1()
 165
 166   /**
 167    * Returns a value which will allow the caller to walk
 168    * a collection of <code>collectionSize</code> values, without repeating or missing
 169    * any, and spanning the collection from beginning to end at least once with
 170    * <code>sampleSize</code> visited locations. Choosing a value
 171    * that is relatively prime to the collection size ensures that stepping by that size (modulo
 172    * the collection size) will hit all locations without repeating, eliminating the need to
 173    * track previously visited locations for a "without replacement" sample. Starting with the
 174    * square root of the collection size ensures that either the first or second prime tried will
 175    * work (they can't both divide the collection size). It also has the property that N steps of
 176    * size N will span a collection of N**2 elements once. If the sample is bigger than N, it will
 177    * wrap multiple times (without repeating). If the sample is smaller, a step size is chosen
 178    * that will result in at least one spanning of the collection.
 179    *
 180    * @param collectionSize The number of values in the collection to be sampled.
 181    * @param sampleSize The number of values wanted in the sample.
 182    * @return A good increment value for walking the collection.
 183    */
 184   private static int findGoodStepSize(int collectionSize, int sampleSize) {
 185     int i = (int) Math.sqrt(collectionSize);
 186     if (sampleSize < i) {
 187       i = collectionSize / sampleSize;
 188     }
 189     do {
 190       i = RandomSample.findNextPrimeAfter(i);
 191     } while (collectionSize % i == 0);
 192     return i;
 193   } // end RandomSample.findGoodStepSize()
 194
 195   /**
 196    * Returns the first prime number that is larger than <code>n</code>.
 197    * @param n A number less than the prime to be returned.
 198    * @return The smallest prime larger than <code>n</code>.
 199    */
 200   private static int findNextPrimeAfter(int n) {
 201     n += (n % 2 == 0) ? 1 : 2; // next odd
 202     foundFactor: for (;; n += 2) {
 203       int sri = (int) (Math.sqrt(n));
 204       for (int primeIndex = 0; primeIndex < RandomSample.N_PRIMES; primeIndex++) {
 205         int p = RandomSample.primes[primeIndex];
 206         if (p > sri) {
 207           return n;
 208         }
 209         if (n % p == 0) {
 210           continue foundFactor;
 211         }
 212       }
 213       for (int p = RandomSample.primes[RandomSample.N_PRIMES - 1] + 2;; p += 2) {
 214         if (p > sri) {
 215           return n;
 216         }
 217         if (n % p == 0) {
 218           continue foundFactor;
 219         }
 220       }
 221     }
 222   } // end RandomSample.findNextPrimeAfter()
 223
 224   /**
 225    * Divides the values in <code>collection</code> into <code>numSubranges</code>
 226    * subranges from <code>minValue</code> to <code>maxValue</code> and returns the
 227    * number of values in each subrange. (For testing the flatness of distribution of
 228    * a sample.)
 229    * @param collection The collection of values to be counted.
 230    * @param range The number of possible values.
 231    * @param numSubranges How many intervals to divide the value range into.
 232    */
 233   private static int[] countsBySubrange(int[] collection, int range, int numSubranges) {
 234     int[] counts = new int[numSubranges];
 235     Arrays.fill(counts, 0);
 236     int numInSubrange = range / numSubranges;
 237     for (int j = 0; j < collection.length; j++) {
 238       counts[collection[j] / numInSubrange]++;
 239     }
 240     return counts;
 241   } // end RandomSample.countsBySubrange()
 242
 243   /**
 244    * Factors <code>value</code> into primes.
 245    */
 246   public static int[] factor(long value) {
 247     ArrayList<Integer> list = new ArrayList<Integer>();
 248     while (value > 1 && value % 2 == 0) {
 249       list.add(2);
 250       value /= 2;
 251     }
 252     long sqrt = Math.round(Math.sqrt(value));
 253     for (int pIndex = 0, lim = RandomSample.primes.length; pIndex < lim; pIndex++) {
 254       int p = RandomSample.primes[pIndex];
 255       if (p >= sqrt) {
 256         break;
 257       }
 258       while (value % p == 0) {
 259         list.add(p);
 260         value /= p;
 261         sqrt = Math.round(Math.sqrt(value));
 262       }
 263     }
 264     if (list.size() == 0 || value > Integer.MAX_VALUE) {
 265       throw new RuntimeException("Prime or too large to factor: "+value);
 266     }
 267     if ((int)value > 1) {
 268       list.add((int)value);
 269     }
 270     int[] factors = new int[list.size()];
 271     for (int j = 0; j < factors.length; j++) {
 272       factors[j] = list.get(j).intValue();
 273     }
 274     return factors;
 275   } // end RandomSample.factor()
 276
 277   /**
 278    * The first N_PRIMES primes, after 2.
 279    */
 280   private static final int N_PRIMES = 4000;
 281   private static int[] primes = new int[RandomSample.N_PRIMES];
 282   static {
 283     RandomSample.primes[0] = 3;
 284     for (int count = 1; count < RandomSample.N_PRIMES; count++) {
 285       primes[count] = RandomSample.findNextPrimeAfter(primes[count - 1]);
 286     }
 287   }
 288
 289   /**
 290    * Returns <code>sample</code>.length values chosen from the first <code>collectionSize</code>
 291    * locations of <code>collection</code>, using the HASHING algorithm. Performance measurements
 292    * are returned in <code>times</code>, which must be an array of at least three longs. The first
 293    * will be set when the algorithm starts; the second, when a hash key has been calculated and
 294    * inserted into the priority queue for every element in the collection; and the third when the
 295    * original elements associated with the keys remaining in the PQ have been stored in the sample
 296    * array for return.
 297    * <P>
 298    * This algorithm slows as the sample size becomes a significant fraction of the collection
 299    * size, because the PQ is as large as the sample set, and will not do early rejection of values
 300    * below the minimum until it fills up, and a larger PQ contains more small values to be purged,
 301    * resulting in less early rejection and more logN insertions.
 302    *
 303    * @param collection The set to be sampled.
 304    * @param collectionSize The number of values to use (starting from first).
 305    * @param sample The array in which to return the sample.
 306    * @param times The times of three events, for measuring performance.
 307    */
 308   private static void sample2(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times)
 309   throws IOException {
 310     if (RandomSample.returnTimings) {
 311       times[0] = System.currentTimeMillis();
 312     }
 313     int sampleSize = sample.length;
 314     IntPriorityQueue pq = new IntPriorityQueue(sampleSize);
 315     /*
 316      * Convert every value in the collection to a hashed "weight" value, and insert
 317      * into a bounded PQ (retains only sampleSize highest weights).
 318      */
 319     ScoredDocIDsIterator it = collection.iterator();
 320     while (it.next()) {
 321       pq.insertWithReuse((int)(it.getDocID() * PHI_32) & 0x7FFFFFFF);
 322     }
 323     if (RandomSample.returnTimings) {
 324       times[1] = System.currentTimeMillis();
 325     }
 326     /*
 327      * Extract heap, convert weights back to original values, and return as integers.
 328      */
 329     Object[] heap = pq.getHeap();
 330     for (int si = 0; si < sampleSize; si++) {
 331       sample[si] = (int)(((IntPriorityQueue.MI)(heap[si+1])).value * PHI_32I) & 0x7FFFFFFF;
 332     }
 333     if (RandomSample.returnTimings) {
 334       times[2] = System.currentTimeMillis();
 335     }
 336   } // end RandomSample.sample2()
 337
 338   /**
 339    * A bounded priority queue for Integers, to retain a specified number of
 340    * the highest-weighted values for return as a random sample.
 341    */
 342   private static class IntPriorityQueue extends PriorityQueue<Object> {
 343
 344     /**
 345      * Creates a bounded PQ of size <code>size</code>.
 346      * @param size The number of elements to retain.
 347      */
 348     public IntPriorityQueue(int size) {
 349       super();
 350       this.initialize(size);
 351     }
 352
 353     /**
 354      * Inserts an integer with overflow and object reuse.
 355      */
 356     public void insertWithReuse(int intval) {
 357       if (this.mi == null) {
 358         this.mi = new MI();
 359       }
 360       this.mi.value = intval;
 361       this.mi = (MI)this.insertWithOverflow(this.mi);
 362     } // end IntPriorityQueue.insertWithReuse()
 363
 364     /**
 365      * Returns the underlying data structure for faster access. Extracting elements
 366      * one at a time would require N logN time, and since we want the elements sorted
 367      * in ascending order by value (not weight), the array is useful as-is.
 368      * @return The underlying heap array.
 369      */
 370     public Object[] getHeap() {
 371       return getHeapArray();
 372     }
 373
 374     /**
 375      * Returns true if <code>o1<code>'s weight is less than that of <code>o2</code>, for
 376      * ordering in the PQ.
 377      * @return True if <code>o1</code> weighs less than <code>o2</code>.
 378      */
 379     @Override
 380     public boolean lessThan(Object o1, Object o2) {
 381       return ((MI)o1).value < ((MI)o2).value;
 382     }
 383
 384     /**
 385      * A mutable integer that lets queue objects be reused once they start overflowing.
 386      */
 387     private static class MI {
 388       MI() { }
 389       public int value;
 390     } // end class RandomSample.IntPriorityQueue.MI
 391
 392     /**
 393      * The mutable integer instance for reuse after first overflow.
 394      */
 395     private MI mi;
 396
 397   } // end class RandomSample.IntPriorityQueue
 398
 399   /**
 400    * For specifying which sampling algorithm to use.
 401    */
 402   public static class Algorithm {
 403
 404     /**
 405      * Specifies a methodical traversal algorithm, which is guaranteed to span the collection
 406      * at least once, and never to return duplicates. Faster than the hashing algorithm and
 407      * uses much less space, but the randomness of the sample may be affected by systematic
 408      * variations in the collection. Requires only an array for the sample, and visits only
 409      * the number of elements in the sample set, not the full set.
 410      */
 411     // TODO (Facet): This one produces a bimodal distribution (very flat around
 412     // each peak!) for collection size 10M and sample sizes 10k and 10544.
 413     // Figure out why.
 414     public static final Algorithm TRAVERSAL = new Algorithm("Traversal");
 415
 416     /**
 417      * Specifies a Fibonacci-style hash algorithm (see Knuth, S&S), which generates a less
 418      * systematically distributed subset of the sampled collection than the traversal method,
 419      * but requires a bounded priority queue the size of the sample, and creates an object
 420      * containing a sampled value and its hash, for every element in the full set.
 421      */
 422     public static final Algorithm HASHING = new Algorithm("Hashing");
 423
 424     /**
 425      * Constructs an instance of an algorithm.
 426      * @param name An ID for printing.
 427      */
 428     private Algorithm(String name) {
 429       this.name = name;
 430     }
 431
 432     /**
 433      * Prints this algorithm's name.
 434      */
 435     @Override
 436     public String toString() {
 437       return this.name;
 438     }
 439
 440     /**
 441      * The name of this algorithm, for printing.
 442      */
 443     private String name;
 444
 445   } // end class RandomSample.Algorithm
 446
 447   /**
 448    * For specifying whether to sort the sample.
 449    */
 450   public static class Sorted {
 451
 452     /**
 453      * Specifies sorting the resulting sample before returning.
 454      */
 455     public static final Sorted YES = new Sorted("sorted");
 456
 457     /**
 458      * Specifies not sorting the resulting sample.
 459      */
 460     public static final Sorted NO = new Sorted("unsorted");
 461
 462     /**
 463      * Constructs an instance of a "sorted" selector.
 464      * @param name An ID for printing.
 465      */
 466     private Sorted(String name) {
 467       this.name = name;
 468     }
 469
 470     /**
 471      * Prints this selector's name.
 472      */
 473     @Override
 474     public String toString() {
 475       return this.name;
 476     }
 477
 478     /**
 479      * The name of this selector, for printing.
 480      */
 481     private String name;
 482
 483   } // end class RandomSample.Sorted
 484
 485   /**
 486    * Magic number 1: prime closest to phi, in 32 bits.
 487    */
 488   private static final long PHI_32 = 2654435769L;
 489
 490   /**
 491    * Magic number 2: multiplicative inverse of PHI_32, modulo 2**32.
 492    */
 493   private static final long PHI_32I = 340573321L;
 494
 495   /**
 496    * Switch to cause methods to return timings.
 497    */
 498   private static boolean returnTimings = false;
 499
 500   /**
 501    * Self-test.
 502    */
 503   public static void main(String[] args) throws Exception {
 504     RandomSample.returnTimings = true;
 505     /*
 506      * Create an array of sequential integers, from which samples will be taken.
 507      */
 508     final int COLLECTION_SIZE = 10 * 1000 * 1000;
 509     ScoredDocIDs collection = createAllScoredDocs(COLLECTION_SIZE);
 510
 511     /*
 512      * Factor PHI.
 513      *
 514         int[] factors = RandomSample.factor(PHI_32);
 515         System.out.print("Factors of PHI_32: ");
 516         for (int k : factors) {
 517           System.out.print(k+", ");
 518         }
 519         System.out.println("");
 520
 521      * Verify inverse relationship of PHI & phi.
 522      *
 523         boolean inverseValid = true;
 524         for (int j = 0; j < Integer.MAX_VALUE; j++) {
 525           int k = (int)(j * PHI_32) & 0x7FFFFFFF;
 526           int m = (int)(k * PHI_32I) & 0X7FFFFFFF;
 527           if (j != m) {
 528             System.out.println("Inverse not valid for "+j);
 529             inverseValid = false;
 530           }
 531         }
 532         System.out.println("Inverse valid? "+inverseValid);
 533      */
 534     /*
 535      * Take samples of various sizes from the full set, verify no duplicates,
 536      * check flatness.
 537      */
 538     int[] sampleSizes = {
 539         10, 57, 100, 333, 1000, 2154, 10000
 540     };
 541     Algorithm[] algorithms = { Algorithm.HASHING, Algorithm.TRAVERSAL };
 542     for (int sampleSize : sampleSizes) {
 543       for (Algorithm algorithm : algorithms) {
 544         System.out.println("Sample size " + sampleSize
 545             + ", algorithm " + algorithm + "...");
 546         /*
 547          * Take the sample.
 548          */
 549         int[] sample = RandomSample.repeatableSample(
 550             collection, COLLECTION_SIZE, sampleSize, algorithm, Sorted.YES);
 551         /*
 552          * Check for duplicates.
 553          */
 554         boolean noDups = true;
 555         for (int j = 0; j < sampleSize - 1; j++) {
 556           if (sample[j] == sample[j + 1]) {
 557             System.out.println("Duplicate value "
 558                 + sample[j] + " at " + j + ", "
 559                 + (j + 1));
 560             noDups = false;
 561             break;
 562           }
 563         }
 564         if (noDups) {
 565           System.out.println("No duplicates.");
 566         }
 567         if (algorithm == Algorithm.HASHING) {
 568           System.out.print("Hashed sample, up to 100 of "+sampleSize+": ");
 569           int lim = Math.min(100, sampleSize);
 570           for (int k = 0; k < lim; k++) {
 571             System.out.print(sample[k]+", ");
 572           }
 573           System.out.println("");
 574         }
 575         /*
 576          * Check flatness of distribution in sample.
 577          */
 578         final int N_INTERVALS = 100;
 579         int[] counts = RandomSample.countsBySubrange(sample, COLLECTION_SIZE, N_INTERVALS);
 580         int minCount = Integer.MAX_VALUE;
 581         int maxCount = Integer.MIN_VALUE;
 582         int avgCount = 0;
 583         for (int j = 0; j < N_INTERVALS; j++) {
 584           int count = counts[j];
 585           if (count < minCount) {
 586             minCount = count;
 587           }
 588           if (count > maxCount) {
 589             maxCount = count;
 590           }
 591           avgCount += count;
 592         }
 593         avgCount /= N_INTERVALS;
 594         System.out.println("Min, max, avg: "+minCount+", "+maxCount+", "+avgCount);
 595
 596         if (((double)minCount - avgCount)/avgCount < -0.05 && (minCount - avgCount) < -5) {
 597           System.out.println("Not flat enough.");
 598         } else if (((double)maxCount - avgCount)/avgCount > 0.05 && (maxCount - avgCount) > 5) {
 599           System.out.println("Not flat enough.");
 600         } else {
 601           System.out.println("Flat enough.");
 602         }
 603         if (sampleSize == 10544 && algorithm == Algorithm.TRAVERSAL) {
 604           System.out.print("Counts of interest: ");
 605           for (int j = 0; j < N_INTERVALS; j++) {
 606             System.out.print(counts[j]+", ");
 607           }
 608           System.out.println("");
 609         }
 610       }
 611     }
 612     System.out.println("Last prime is "
 613         + RandomSample.primes[RandomSample.N_PRIMES - 1]);
 614   }
 615
 616   private static ScoredDocIDs createAllScoredDocs(final int COLLECTION_SIZE)
 617   throws CorruptIndexException, LockObtainFailedException, IOException {
 618     ScoredDocIDs collection;
 619
 620     IndexReader reader = null;
 621     Directory ramDir = new RAMDirectory();
 622     try {
 623       IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_30, new KeywordAnalyzer()));
 624       for (int i = 0; i < COLLECTION_SIZE; i++) {
 625         writer.addDocument(new Document());
 626       }
 627       writer.commit();
 628       writer.close();
 629       reader = IndexReader.open(ramDir);
 630       collection = ScoredDocIdsUtils.createAllDocsScoredDocIDs(reader);
 631     } finally {
 632       if (reader != null) {
 633         reader.close();
 634       }
 635       ramDir.close();
 636     }
 637     return collection;
 638   }
 639 } // end class RandomSample