sampleSize
values from the first collectionSize
- * locations of collection
, chosen using
- * the TRAVERSAL
algorithm. The sample values are not sorted.
- * @param collection The values from which a sample is wanted.
- * @param collectionSize The number of values (from the first) from which to draw the sample.
- * @param sampleSize The number of values to return.
- * @return An array of values chosen from the collection.
- * @see Algorithm#TRAVERSAL
- */
- public static int[] repeatableSample(ScoredDocIDs collection,
- int collectionSize, int sampleSize)
- throws IOException {
- return RandomSample.repeatableSample(collection, collectionSize,
- sampleSize, Algorithm.HASHING, Sorted.NO);
- }
-
- /**
- * Returns sampleSize
values from the first collectionSize
- * locations of collection
, chosen using algorithm
.
- * @param collection The values from which a sample is wanted.
- * @param collectionSize The number of values (from the first) from which to draw the sample.
- * @param sampleSize The number of values to return.
- * @param algorithm Which algorithm to use.
- * @param sorted Sorted.YES to sort the sample values in ascending order before returning;
- * Sorted.NO to return them in essentially random order.
- * @return An array of values chosen from the collection.
- */
- public static int[] repeatableSample(ScoredDocIDs collection,
- int collectionSize, int sampleSize,
- Algorithm algorithm, Sorted sorted)
- throws IOException {
- if (collection == null) {
- throw new IOException("docIdSet is null");
- }
- if (sampleSize < 1) {
- throw new IOException("sampleSize < 1 (" + sampleSize + ")");
- }
- if (collectionSize < sampleSize) {
- throw new IOException("collectionSize (" + collectionSize + ") less than sampleSize (" + sampleSize + ")");
- }
- int[] sample = new int[sampleSize];
- long[] times = new long[4];
- if (algorithm == Algorithm.TRAVERSAL) {
- RandomSample.sample1(collection, collectionSize, sample, times);
- } else if (algorithm == Algorithm.HASHING) {
- RandomSample.sample2(collection, collectionSize, sample, times);
- } else {
- throw new IllegalArgumentException("Invalid algorithm selection");
- }
- if (sorted == Sorted.YES) {
- Arrays.sort(sample);
- }
- if (RandomSample.returnTimings) {
- times[3] = System.currentTimeMillis();
- if (logger.isLoggable(Level.FINEST)) {
- logger.finest("Times: " + (times[1] - times[0]) + "ms, "
- + (times[2] - times[1]) + "ms, " + (times[3] - times[2])+"ms");
- }
- }
- return sample;
- }
-
- /**
- * Returns sample
.length values chosen from the first collectionSize
- * locations of collection
, using the TRAVERSAL algorithm. The sample is
- * pseudorandom: no subset of the original collection
- * is in principle more likely to occur than any other, but for a given collection
- * and sample size, the same sample will always be returned. This algorithm walks the
- * original collection in a methodical way that is guaranteed not to visit any location
- * more than once, which makes sampling without replacement faster because removals don't
- * have to be tracked, and the number of operations is proportional to the sample size,
- * not the collection size.
- * Times for performance measurement
- * are returned in times
, which must be an array of at least three longs, containing
- * nanosecond event times. The first
- * is set when the algorithm starts; the second, when the step size has been calculated;
- * and the third when the sample has been taken.
- * @param collection The set to be sampled.
- * @param collectionSize The number of values to use (starting from first).
- * @param sample The array in which to return the sample.
- * @param times The times of three events, for measuring performance.
- */
- private static void sample1(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times)
- throws IOException {
- ScoredDocIDsIterator it = collection.iterator();
- if (RandomSample.returnTimings) {
- times[0] = System.currentTimeMillis();
- }
- int sampleSize = sample.length;
- int prime = RandomSample.findGoodStepSize(collectionSize, sampleSize);
- int mod = prime % collectionSize;
- if (RandomSample.returnTimings) {
- times[1] = System.currentTimeMillis();
- }
- int sampleCount = 0;
- int index = 0;
- for (; sampleCount < sampleSize;) {
- if (index + mod < collectionSize) {
- for (int i = 0; i < mod; i++, index++) {
- it.next();
- }
- } else {
- index = index + mod - collectionSize;
- it = collection.iterator();
- for (int i = 0; i < index; i++) {
- it.next();
- }
- }
- sample[sampleCount++] = it.getDocID();
- }
- if (RandomSample.returnTimings) {
- times[2] = System.currentTimeMillis();
- }
- } // end RandomSample.sample1()
-
- /**
- * Returns a value which will allow the caller to walk
- * a collection of collectionSize
values, without repeating or missing
- * any, and spanning the collection from beginning to end at least once with
- * sampleSize
visited locations. Choosing a value
- * that is relatively prime to the collection size ensures that stepping by that size (modulo
- * the collection size) will hit all locations without repeating, eliminating the need to
- * track previously visited locations for a "without replacement" sample. Starting with the
- * square root of the collection size ensures that either the first or second prime tried will
- * work (they can't both divide the collection size). It also has the property that N steps of
- * size N will span a collection of N**2 elements once. If the sample is bigger than N, it will
- * wrap multiple times (without repeating). If the sample is smaller, a step size is chosen
- * that will result in at least one spanning of the collection.
- *
- * @param collectionSize The number of values in the collection to be sampled.
- * @param sampleSize The number of values wanted in the sample.
- * @return A good increment value for walking the collection.
- */
- private static int findGoodStepSize(int collectionSize, int sampleSize) {
- int i = (int) Math.sqrt(collectionSize);
- if (sampleSize < i) {
- i = collectionSize / sampleSize;
- }
- do {
- i = RandomSample.findNextPrimeAfter(i);
- } while (collectionSize % i == 0);
- return i;
- } // end RandomSample.findGoodStepSize()
-
- /**
- * Returns the first prime number that is larger than n
.
- * @param n A number less than the prime to be returned.
- * @return The smallest prime larger than n
.
- */
- private static int findNextPrimeAfter(int n) {
- n += (n % 2 == 0) ? 1 : 2; // next odd
- foundFactor: for (;; n += 2) {
- int sri = (int) (Math.sqrt(n));
- for (int primeIndex = 0; primeIndex < RandomSample.N_PRIMES; primeIndex++) {
- int p = RandomSample.primes[primeIndex];
- if (p > sri) {
- return n;
- }
- if (n % p == 0) {
- continue foundFactor;
- }
- }
- for (int p = RandomSample.primes[RandomSample.N_PRIMES - 1] + 2;; p += 2) {
- if (p > sri) {
- return n;
- }
- if (n % p == 0) {
- continue foundFactor;
- }
- }
- }
- } // end RandomSample.findNextPrimeAfter()
-
- /**
- * Divides the values in collection
into numSubranges
- * subranges from minValue
to maxValue
and returns the
- * number of values in each subrange. (For testing the flatness of distribution of
- * a sample.)
- * @param collection The collection of values to be counted.
- * @param range The number of possible values.
- * @param numSubranges How many intervals to divide the value range into.
- */
- private static int[] countsBySubrange(int[] collection, int range, int numSubranges) {
- int[] counts = new int[numSubranges];
- Arrays.fill(counts, 0);
- int numInSubrange = range / numSubranges;
- for (int j = 0; j < collection.length; j++) {
- counts[collection[j] / numInSubrange]++;
- }
- return counts;
- } // end RandomSample.countsBySubrange()
-
- /**
- * Factors value
into primes.
- */
- public static int[] factor(long value) {
- ArrayListsample
.length values chosen from the first collectionSize
- * locations of collection
, using the HASHING algorithm. Performance measurements
- * are returned in times
, which must be an array of at least three longs. The first
- * will be set when the algorithm starts; the second, when a hash key has been calculated and
- * inserted into the priority queue for every element in the collection; and the third when the
- * original elements associated with the keys remaining in the PQ have been stored in the sample
- * array for return.
- * - * This algorithm slows as the sample size becomes a significant fraction of the collection - * size, because the PQ is as large as the sample set, and will not do early rejection of values - * below the minimum until it fills up, and a larger PQ contains more small values to be purged, - * resulting in less early rejection and more logN insertions. - * - * @param collection The set to be sampled. - * @param collectionSize The number of values to use (starting from first). - * @param sample The array in which to return the sample. - * @param times The times of three events, for measuring performance. - */ - private static void sample2(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times) - throws IOException { - if (RandomSample.returnTimings) { - times[0] = System.currentTimeMillis(); - } - int sampleSize = sample.length; - IntPriorityQueue pq = new IntPriorityQueue(sampleSize); - /* - * Convert every value in the collection to a hashed "weight" value, and insert - * into a bounded PQ (retains only sampleSize highest weights). - */ - ScoredDocIDsIterator it = collection.iterator(); - while (it.next()) { - pq.insertWithReuse((int)(it.getDocID() * PHI_32) & 0x7FFFFFFF); - } - if (RandomSample.returnTimings) { - times[1] = System.currentTimeMillis(); - } - /* - * Extract heap, convert weights back to original values, and return as integers. - */ - Object[] heap = pq.getHeap(); - for (int si = 0; si < sampleSize; si++) { - sample[si] = (int)(((IntPriorityQueue.MI)(heap[si+1])).value * PHI_32I) & 0x7FFFFFFF; - } - if (RandomSample.returnTimings) { - times[2] = System.currentTimeMillis(); - } - } // end RandomSample.sample2() - - /** - * A bounded priority queue for Integers, to retain a specified number of - * the highest-weighted values for return as a random sample. - */ - private static class IntPriorityQueue extends PriorityQueue