1 package org.apache.lucene.facet.util;
3 import java.io.IOException;
4 import java.util.ArrayList;
5 import java.util.Arrays;
6 import java.util.logging.Level;
7 import java.util.logging.Logger;
9 import org.apache.lucene.analysis.KeywordAnalyzer;
10 import org.apache.lucene.document.Document;
11 import org.apache.lucene.index.CorruptIndexException;
12 import org.apache.lucene.index.IndexReader;
13 import org.apache.lucene.index.IndexWriter;
14 import org.apache.lucene.index.IndexWriterConfig;
15 import org.apache.lucene.store.Directory;
16 import org.apache.lucene.store.LockObtainFailedException;
17 import org.apache.lucene.store.RAMDirectory;
18 import org.apache.lucene.util.PriorityQueue;
19 import org.apache.lucene.util.Version;
21 import org.apache.lucene.facet.search.ScoredDocIDs;
22 import org.apache.lucene.facet.search.ScoredDocIDsIterator;
25 * Licensed to the Apache Software Foundation (ASF) under one or more
26 * contributor license agreements. See the NOTICE file distributed with
27 * this work for additional information regarding copyright ownership.
28 * The ASF licenses this file to You under the Apache License, Version 2.0
29 * (the "License"); you may not use this file except in compliance with
30 * the License. You may obtain a copy of the License at
32 * http://www.apache.org/licenses/LICENSE-2.0
34 * Unless required by applicable law or agreed to in writing, software
35 * distributed under the License is distributed on an "AS IS" BASIS,
36 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
37 * See the License for the specific language governing permissions and
38 * limitations under the License.
42 * Take random samples of large collections.
44 * @lucene.experimental
46 public class RandomSample {
48 private static final Logger logger = Logger.getLogger(RandomSample.class.getName());
51 * Returns <code>sampleSize</code> values from the first <code>collectionSize</code>
52 * locations of <code>collection</code>, chosen using
53 * the <code>TRAVERSAL</code> algorithm. The sample values are not sorted.
54 * @param collection The values from which a sample is wanted.
55 * @param collectionSize The number of values (from the first) from which to draw the sample.
56 * @param sampleSize The number of values to return.
57 * @return An array of values chosen from the collection.
58 * @see Algorithm#TRAVERSAL
60 public static int[] repeatableSample(ScoredDocIDs collection,
61 int collectionSize, int sampleSize)
63 return RandomSample.repeatableSample(collection, collectionSize,
64 sampleSize, Algorithm.HASHING, Sorted.NO);
68 * Returns <code>sampleSize</code> values from the first <code>collectionSize</code>
69 * locations of <code>collection</code>, chosen using <code>algorithm</code>.
70 * @param collection The values from which a sample is wanted.
71 * @param collectionSize The number of values (from the first) from which to draw the sample.
72 * @param sampleSize The number of values to return.
73 * @param algorithm Which algorithm to use.
74 * @param sorted Sorted.YES to sort the sample values in ascending order before returning;
75 * Sorted.NO to return them in essentially random order.
76 * @return An array of values chosen from the collection.
78 public static int[] repeatableSample(ScoredDocIDs collection,
79 int collectionSize, int sampleSize,
80 Algorithm algorithm, Sorted sorted)
82 if (collection == null) {
83 throw new IOException("docIdSet is null");
86 throw new IOException("sampleSize < 1 (" + sampleSize + ")");
88 if (collectionSize < sampleSize) {
89 throw new IOException("collectionSize (" + collectionSize + ") less than sampleSize (" + sampleSize + ")");
91 int[] sample = new int[sampleSize];
92 long[] times = new long[4];
93 if (algorithm == Algorithm.TRAVERSAL) {
94 RandomSample.sample1(collection, collectionSize, sample, times);
95 } else if (algorithm == Algorithm.HASHING) {
96 RandomSample.sample2(collection, collectionSize, sample, times);
98 throw new IllegalArgumentException("Invalid algorithm selection");
100 if (sorted == Sorted.YES) {
103 if (RandomSample.returnTimings) {
104 times[3] = System.currentTimeMillis();
105 if (logger.isLoggable(Level.FINEST)) {
106 logger.finest("Times: " + (times[1] - times[0]) + "ms, "
107 + (times[2] - times[1]) + "ms, " + (times[3] - times[2])+"ms");
114 * Returns <code>sample</code>.length values chosen from the first <code>collectionSize</code>
115 * locations of <code>collection</code>, using the TRAVERSAL algorithm. The sample is
116 * pseudorandom: no subset of the original collection
117 * is in principle more likely to occur than any other, but for a given collection
118 * and sample size, the same sample will always be returned. This algorithm walks the
119 * original collection in a methodical way that is guaranteed not to visit any location
120 * more than once, which makes sampling without replacement faster because removals don't
121 * have to be tracked, and the number of operations is proportional to the sample size,
122 * not the collection size.
123 * Times for performance measurement
124 * are returned in <code>times</code>, which must be an array of at least three longs, containing
125 * nanosecond event times. The first
126 * is set when the algorithm starts; the second, when the step size has been calculated;
127 * and the third when the sample has been taken.
128 * @param collection The set to be sampled.
129 * @param collectionSize The number of values to use (starting from first).
130 * @param sample The array in which to return the sample.
131 * @param times The times of three events, for measuring performance.
133 private static void sample1(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times)
135 ScoredDocIDsIterator it = collection.iterator();
136 if (RandomSample.returnTimings) {
137 times[0] = System.currentTimeMillis();
139 int sampleSize = sample.length;
140 int prime = RandomSample.findGoodStepSize(collectionSize, sampleSize);
141 int mod = prime % collectionSize;
142 if (RandomSample.returnTimings) {
143 times[1] = System.currentTimeMillis();
147 for (; sampleCount < sampleSize;) {
148 if (index + mod < collectionSize) {
149 for (int i = 0; i < mod; i++, index++) {
153 index = index + mod - collectionSize;
154 it = collection.iterator();
155 for (int i = 0; i < index; i++) {
159 sample[sampleCount++] = it.getDocID();
161 if (RandomSample.returnTimings) {
162 times[2] = System.currentTimeMillis();
164 } // end RandomSample.sample1()
167 * Returns a value which will allow the caller to walk
168 * a collection of <code>collectionSize</code> values, without repeating or missing
169 * any, and spanning the collection from beginning to end at least once with
170 * <code>sampleSize</code> visited locations. Choosing a value
171 * that is relatively prime to the collection size ensures that stepping by that size (modulo
172 * the collection size) will hit all locations without repeating, eliminating the need to
173 * track previously visited locations for a "without replacement" sample. Starting with the
174 * square root of the collection size ensures that either the first or second prime tried will
175 * work (they can't both divide the collection size). It also has the property that N steps of
176 * size N will span a collection of N**2 elements once. If the sample is bigger than N, it will
177 * wrap multiple times (without repeating). If the sample is smaller, a step size is chosen
178 * that will result in at least one spanning of the collection.
180 * @param collectionSize The number of values in the collection to be sampled.
181 * @param sampleSize The number of values wanted in the sample.
182 * @return A good increment value for walking the collection.
184 private static int findGoodStepSize(int collectionSize, int sampleSize) {
185 int i = (int) Math.sqrt(collectionSize);
186 if (sampleSize < i) {
187 i = collectionSize / sampleSize;
190 i = RandomSample.findNextPrimeAfter(i);
191 } while (collectionSize % i == 0);
193 } // end RandomSample.findGoodStepSize()
196 * Returns the first prime number that is larger than <code>n</code>.
197 * @param n A number less than the prime to be returned.
198 * @return The smallest prime larger than <code>n</code>.
200 private static int findNextPrimeAfter(int n) {
201 n += (n % 2 == 0) ? 1 : 2; // next odd
202 foundFactor: for (;; n += 2) {
203 int sri = (int) (Math.sqrt(n));
204 for (int primeIndex = 0; primeIndex < RandomSample.N_PRIMES; primeIndex++) {
205 int p = RandomSample.primes[primeIndex];
210 continue foundFactor;
213 for (int p = RandomSample.primes[RandomSample.N_PRIMES - 1] + 2;; p += 2) {
218 continue foundFactor;
222 } // end RandomSample.findNextPrimeAfter()
225 * Divides the values in <code>collection</code> into <code>numSubranges</code>
226 * subranges from <code>minValue</code> to <code>maxValue</code> and returns the
227 * number of values in each subrange. (For testing the flatness of distribution of
229 * @param collection The collection of values to be counted.
230 * @param range The number of possible values.
231 * @param numSubranges How many intervals to divide the value range into.
233 private static int[] countsBySubrange(int[] collection, int range, int numSubranges) {
234 int[] counts = new int[numSubranges];
235 Arrays.fill(counts, 0);
236 int numInSubrange = range / numSubranges;
237 for (int j = 0; j < collection.length; j++) {
238 counts[collection[j] / numInSubrange]++;
241 } // end RandomSample.countsBySubrange()
244 * Factors <code>value</code> into primes.
246 public static int[] factor(long value) {
247 ArrayList<Integer> list = new ArrayList<Integer>();
248 while (value > 1 && value % 2 == 0) {
252 long sqrt = Math.round(Math.sqrt(value));
253 for (int pIndex = 0, lim = RandomSample.primes.length; pIndex < lim; pIndex++) {
254 int p = RandomSample.primes[pIndex];
258 while (value % p == 0) {
261 sqrt = Math.round(Math.sqrt(value));
264 if (list.size() == 0 || value > Integer.MAX_VALUE) {
265 throw new RuntimeException("Prime or too large to factor: "+value);
267 if ((int)value > 1) {
268 list.add((int)value);
270 int[] factors = new int[list.size()];
271 for (int j = 0; j < factors.length; j++) {
272 factors[j] = list.get(j).intValue();
275 } // end RandomSample.factor()
278 * The first N_PRIMES primes, after 2.
280 private static final int N_PRIMES = 4000;
281 private static int[] primes = new int[RandomSample.N_PRIMES];
283 RandomSample.primes[0] = 3;
284 for (int count = 1; count < RandomSample.N_PRIMES; count++) {
285 primes[count] = RandomSample.findNextPrimeAfter(primes[count - 1]);
290 * Returns <code>sample</code>.length values chosen from the first <code>collectionSize</code>
291 * locations of <code>collection</code>, using the HASHING algorithm. Performance measurements
292 * are returned in <code>times</code>, which must be an array of at least three longs. The first
293 * will be set when the algorithm starts; the second, when a hash key has been calculated and
294 * inserted into the priority queue for every element in the collection; and the third when the
295 * original elements associated with the keys remaining in the PQ have been stored in the sample
298 * This algorithm slows as the sample size becomes a significant fraction of the collection
299 * size, because the PQ is as large as the sample set, and will not do early rejection of values
300 * below the minimum until it fills up, and a larger PQ contains more small values to be purged,
301 * resulting in less early rejection and more logN insertions.
303 * @param collection The set to be sampled.
304 * @param collectionSize The number of values to use (starting from first).
305 * @param sample The array in which to return the sample.
306 * @param times The times of three events, for measuring performance.
308 private static void sample2(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times)
310 if (RandomSample.returnTimings) {
311 times[0] = System.currentTimeMillis();
313 int sampleSize = sample.length;
314 IntPriorityQueue pq = new IntPriorityQueue(sampleSize);
316 * Convert every value in the collection to a hashed "weight" value, and insert
317 * into a bounded PQ (retains only sampleSize highest weights).
319 ScoredDocIDsIterator it = collection.iterator();
321 pq.insertWithReuse((int)(it.getDocID() * PHI_32) & 0x7FFFFFFF);
323 if (RandomSample.returnTimings) {
324 times[1] = System.currentTimeMillis();
327 * Extract heap, convert weights back to original values, and return as integers.
329 Object[] heap = pq.getHeap();
330 for (int si = 0; si < sampleSize; si++) {
331 sample[si] = (int)(((IntPriorityQueue.MI)(heap[si+1])).value * PHI_32I) & 0x7FFFFFFF;
333 if (RandomSample.returnTimings) {
334 times[2] = System.currentTimeMillis();
336 } // end RandomSample.sample2()
339 * A bounded priority queue for Integers, to retain a specified number of
340 * the highest-weighted values for return as a random sample.
342 private static class IntPriorityQueue extends PriorityQueue<Object> {
345 * Creates a bounded PQ of size <code>size</code>.
346 * @param size The number of elements to retain.
348 public IntPriorityQueue(int size) {
350 this.initialize(size);
354 * Inserts an integer with overflow and object reuse.
356 public void insertWithReuse(int intval) {
357 if (this.mi == null) {
360 this.mi.value = intval;
361 this.mi = (MI)this.insertWithOverflow(this.mi);
362 } // end IntPriorityQueue.insertWithReuse()
365 * Returns the underlying data structure for faster access. Extracting elements
366 * one at a time would require N logN time, and since we want the elements sorted
367 * in ascending order by value (not weight), the array is useful as-is.
368 * @return The underlying heap array.
370 public Object[] getHeap() {
371 return getHeapArray();
375 * Returns true if <code>o1<code>'s weight is less than that of <code>o2</code>, for
376 * ordering in the PQ.
377 * @return True if <code>o1</code> weighs less than <code>o2</code>.
380 public boolean lessThan(Object o1, Object o2) {
381 return ((MI)o1).value < ((MI)o2).value;
385 * A mutable integer that lets queue objects be reused once they start overflowing.
387 private static class MI {
390 } // end class RandomSample.IntPriorityQueue.MI
393 * The mutable integer instance for reuse after first overflow.
397 } // end class RandomSample.IntPriorityQueue
400 * For specifying which sampling algorithm to use.
402 public static class Algorithm {
405 * Specifies a methodical traversal algorithm, which is guaranteed to span the collection
406 * at least once, and never to return duplicates. Faster than the hashing algorithm and
407 * uses much less space, but the randomness of the sample may be affected by systematic
408 * variations in the collection. Requires only an array for the sample, and visits only
409 * the number of elements in the sample set, not the full set.
411 // TODO (Facet): This one produces a bimodal distribution (very flat around
412 // each peak!) for collection size 10M and sample sizes 10k and 10544.
414 public static final Algorithm TRAVERSAL = new Algorithm("Traversal");
417 * Specifies a Fibonacci-style hash algorithm (see Knuth, S&S), which generates a less
418 * systematically distributed subset of the sampled collection than the traversal method,
419 * but requires a bounded priority queue the size of the sample, and creates an object
420 * containing a sampled value and its hash, for every element in the full set.
422 public static final Algorithm HASHING = new Algorithm("Hashing");
425 * Constructs an instance of an algorithm.
426 * @param name An ID for printing.
428 private Algorithm(String name) {
433 * Prints this algorithm's name.
436 public String toString() {
441 * The name of this algorithm, for printing.
445 } // end class RandomSample.Algorithm
448 * For specifying whether to sort the sample.
450 public static class Sorted {
453 * Specifies sorting the resulting sample before returning.
455 public static final Sorted YES = new Sorted("sorted");
458 * Specifies not sorting the resulting sample.
460 public static final Sorted NO = new Sorted("unsorted");
463 * Constructs an instance of a "sorted" selector.
464 * @param name An ID for printing.
466 private Sorted(String name) {
471 * Prints this selector's name.
474 public String toString() {
479 * The name of this selector, for printing.
483 } // end class RandomSample.Sorted
486 * Magic number 1: prime closest to phi, in 32 bits.
488 private static final long PHI_32 = 2654435769L;
491 * Magic number 2: multiplicative inverse of PHI_32, modulo 2**32.
493 private static final long PHI_32I = 340573321L;
496 * Switch to cause methods to return timings.
498 private static boolean returnTimings = false;
503 public static void main(String[] args) throws Exception {
504 RandomSample.returnTimings = true;
506 * Create an array of sequential integers, from which samples will be taken.
508 final int COLLECTION_SIZE = 10 * 1000 * 1000;
509 ScoredDocIDs collection = createAllScoredDocs(COLLECTION_SIZE);
514 int[] factors = RandomSample.factor(PHI_32);
515 System.out.print("Factors of PHI_32: ");
516 for (int k : factors) {
517 System.out.print(k+", ");
519 System.out.println("");
521 * Verify inverse relationship of PHI & phi.
523 boolean inverseValid = true;
524 for (int j = 0; j < Integer.MAX_VALUE; j++) {
525 int k = (int)(j * PHI_32) & 0x7FFFFFFF;
526 int m = (int)(k * PHI_32I) & 0X7FFFFFFF;
528 System.out.println("Inverse not valid for "+j);
529 inverseValid = false;
532 System.out.println("Inverse valid? "+inverseValid);
535 * Take samples of various sizes from the full set, verify no duplicates,
538 int[] sampleSizes = {
539 10, 57, 100, 333, 1000, 2154, 10000
541 Algorithm[] algorithms = { Algorithm.HASHING, Algorithm.TRAVERSAL };
542 for (int sampleSize : sampleSizes) {
543 for (Algorithm algorithm : algorithms) {
544 System.out.println("Sample size " + sampleSize
545 + ", algorithm " + algorithm + "...");
549 int[] sample = RandomSample.repeatableSample(
550 collection, COLLECTION_SIZE, sampleSize, algorithm, Sorted.YES);
552 * Check for duplicates.
554 boolean noDups = true;
555 for (int j = 0; j < sampleSize - 1; j++) {
556 if (sample[j] == sample[j + 1]) {
557 System.out.println("Duplicate value "
558 + sample[j] + " at " + j + ", "
565 System.out.println("No duplicates.");
567 if (algorithm == Algorithm.HASHING) {
568 System.out.print("Hashed sample, up to 100 of "+sampleSize+": ");
569 int lim = Math.min(100, sampleSize);
570 for (int k = 0; k < lim; k++) {
571 System.out.print(sample[k]+", ");
573 System.out.println("");
576 * Check flatness of distribution in sample.
578 final int N_INTERVALS = 100;
579 int[] counts = RandomSample.countsBySubrange(sample, COLLECTION_SIZE, N_INTERVALS);
580 int minCount = Integer.MAX_VALUE;
581 int maxCount = Integer.MIN_VALUE;
583 for (int j = 0; j < N_INTERVALS; j++) {
584 int count = counts[j];
585 if (count < minCount) {
588 if (count > maxCount) {
593 avgCount /= N_INTERVALS;
594 System.out.println("Min, max, avg: "+minCount+", "+maxCount+", "+avgCount);
596 if (((double)minCount - avgCount)/avgCount < -0.05 && (minCount - avgCount) < -5) {
597 System.out.println("Not flat enough.");
598 } else if (((double)maxCount - avgCount)/avgCount > 0.05 && (maxCount - avgCount) > 5) {
599 System.out.println("Not flat enough.");
601 System.out.println("Flat enough.");
603 if (sampleSize == 10544 && algorithm == Algorithm.TRAVERSAL) {
604 System.out.print("Counts of interest: ");
605 for (int j = 0; j < N_INTERVALS; j++) {
606 System.out.print(counts[j]+", ");
608 System.out.println("");
612 System.out.println("Last prime is "
613 + RandomSample.primes[RandomSample.N_PRIMES - 1]);
616 private static ScoredDocIDs createAllScoredDocs(final int COLLECTION_SIZE)
617 throws CorruptIndexException, LockObtainFailedException, IOException {
618 ScoredDocIDs collection;
620 IndexReader reader = null;
621 Directory ramDir = new RAMDirectory();
623 IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_30, new KeywordAnalyzer()));
624 for (int i = 0; i < COLLECTION_SIZE; i++) {
625 writer.addDocument(new Document());
629 reader = IndexReader.open(ramDir);
630 collection = ScoredDocIdsUtils.createAllDocsScoredDocIDs(reader);
632 if (reader != null) {
639 } // end class RandomSample