lucene-java-3.5.0/lucene/contrib/facet/src/java/org/apache/lucene/facet/search/sampling/RepeatableSampler.java

   1 package org.apache.lucene.facet.search.sampling;
   2
   3 import java.io.IOException;
   4 import java.util.Arrays;
   5 import java.util.logging.Level;
   6 import java.util.logging.Logger;
   7
   8 import org.apache.lucene.util.PriorityQueue;
   9
  10 import org.apache.lucene.facet.search.ScoredDocIDs;
  11 import org.apache.lucene.facet.search.ScoredDocIDsIterator;
  12 import org.apache.lucene.facet.util.ScoredDocIdsUtils;
  13
  14 /**
  15  * Licensed to the Apache Software Foundation (ASF) under one or more
  16  * contributor license agreements.  See the NOTICE file distributed with
  17  * this work for additional information regarding copyright ownership.
  18  * The ASF licenses this file to You under the Apache License, Version 2.0
  19  * (the "License"); you may not use this file except in compliance with
  20  * the License.  You may obtain a copy of the License at
  21  *
  22  *     http://www.apache.org/licenses/LICENSE-2.0
  23  *
  24  * Unless required by applicable law or agreed to in writing, software
  25  * distributed under the License is distributed on an "AS IS" BASIS,
  26  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  27  * See the License for the specific language governing permissions and
  28  * limitations under the License.
  29  */
  30
  31 /**
  32  * Take random samples of large collections.
  33  * @lucene.experimental
  34  */
  35 public class RepeatableSampler extends Sampler {
  36
  37   private static final Logger logger = Logger.getLogger(RepeatableSampler.class.getName());
  38
  39   public RepeatableSampler(SamplingParams params) {
  40     super(params);
  41   }
  42
  43   @Override
  44   protected SampleResult createSample(ScoredDocIDs docids, int actualSize,
  45       int sampleSetSize) throws IOException {
  46     int[] sampleSet = null;
  47     try {
  48       sampleSet = repeatableSample(docids, actualSize,
  49           sampleSetSize);
  50     } catch (IOException e) {
  51       if (logger.isLoggable(Level.WARNING)) {
  52         logger.log(Level.WARNING, "sampling failed: "+e.getMessage()+" - falling back to no sampling!", e);
  53       }
  54       return new SampleResult(docids, 1d);
  55     }
  56
  57     ScoredDocIDs sampled = ScoredDocIdsUtils.createScoredDocIDsSubset(docids,
  58         sampleSet);
  59     if (logger.isLoggable(Level.FINEST)) {
  60       logger.finest("******************** " + sampled.size());
  61     }
  62     return new SampleResult(sampled, sampled.size()/(double)docids.size());
  63   }
  64
  65   /**
  66    * Returns <code>sampleSize</code> values from the first <code>collectionSize</code>
  67    * locations of <code>collection</code>, chosen using
  68    * the <code>TRAVERSAL</code> algorithm. The sample values are not sorted.
  69    * @param collection The values from which a sample is wanted.
  70    * @param collectionSize The number of values (from the first) from which to draw the sample.
  71    * @param sampleSize The number of values to return.
  72    * @return An array of values chosen from the collection.
  73    * @see Algorithm#TRAVERSAL
  74    */
  75   private static int[] repeatableSample(ScoredDocIDs collection,
  76       int collectionSize, int sampleSize)
  77   throws IOException {
  78     return repeatableSample(collection, collectionSize,
  79         sampleSize, Algorithm.HASHING, Sorted.NO);
  80   }
  81
  82   /**
  83    * Returns <code>sampleSize</code> values from the first <code>collectionSize</code>
  84    * locations of <code>collection</code>, chosen using <code>algorithm</code>.
  85    * @param collection The values from which a sample is wanted.
  86    * @param collectionSize The number of values (from the first) from which to draw the sample.
  87    * @param sampleSize The number of values to return.
  88    * @param algorithm Which algorithm to use.
  89    * @param sorted Sorted.YES to sort the sample values in ascending order before returning;
  90    * Sorted.NO to return them in essentially random order.
  91    * @return An array of values chosen from the collection.
  92    */
  93   private static int[] repeatableSample(ScoredDocIDs collection,
  94       int collectionSize, int sampleSize,
  95       Algorithm algorithm, Sorted sorted)
  96   throws IOException {
  97     if (collection == null) {
  98       throw new IOException("docIdSet is null");
  99     }
 100     if (sampleSize < 1) {
 101       throw new IOException("sampleSize < 1 (" + sampleSize + ")");
 102     }
 103     if (collectionSize < sampleSize) {
 104       throw new IOException("collectionSize (" + collectionSize + ") less than sampleSize (" + sampleSize + ")");
 105     }
 106     int[] sample = new int[sampleSize];
 107     long[] times = new long[4];
 108     if (algorithm == Algorithm.TRAVERSAL) {
 109       sample1(collection, collectionSize, sample, times);
 110     } else if (algorithm == Algorithm.HASHING) {
 111       sample2(collection, collectionSize, sample, times);
 112     } else {
 113       throw new IllegalArgumentException("Invalid algorithm selection");
 114     }
 115     if (sorted == Sorted.YES) {
 116       Arrays.sort(sample);
 117     }
 118     if (returnTimings) {
 119       times[3] = System.currentTimeMillis();
 120       if (logger.isLoggable(Level.FINEST)) {
 121         logger.finest("Times: " + (times[1] - times[0]) + "ms, "
 122             + (times[2] - times[1]) + "ms, " + (times[3] - times[2])+"ms");
 123       }
 124     }
 125     return sample;
 126   }
 127
 128   /**
 129    * Returns <code>sample</code>.length values chosen from the first <code>collectionSize</code>
 130    * locations of <code>collection</code>, using the TRAVERSAL algorithm. The sample is
 131    * pseudorandom: no subset of the original collection
 132    * is in principle more likely to occur than any other, but for a given collection
 133    * and sample size, the same sample will always be returned. This algorithm walks the
 134    * original collection in a methodical way that is guaranteed not to visit any location
 135    * more than once, which makes sampling without replacement faster because removals don't
 136    * have to be tracked, and the number of operations is proportional to the sample size,
 137    * not the collection size.
 138    * Times for performance measurement
 139    * are returned in <code>times</code>, which must be an array of at least three longs, containing
 140    * nanosecond event times. The first
 141    * is set when the algorithm starts; the second, when the step size has been calculated;
 142    * and the third when the sample has been taken.
 143    * @param collection The set to be sampled.
 144    * @param collectionSize The number of values to use (starting from first).
 145    * @param sample The array in which to return the sample.
 146    * @param times The times of three events, for measuring performance.
 147    */
 148   private static void sample1(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times)
 149   throws IOException {
 150     ScoredDocIDsIterator it = collection.iterator();
 151     if (returnTimings) {
 152       times[0] = System.currentTimeMillis();
 153     }
 154     int sampleSize = sample.length;
 155     int prime = findGoodStepSize(collectionSize, sampleSize);
 156     int mod = prime % collectionSize;
 157     if (returnTimings) {
 158       times[1] = System.currentTimeMillis();
 159     }
 160     int sampleCount = 0;
 161     int index = 0;
 162     for (; sampleCount < sampleSize;) {
 163       if (index + mod < collectionSize) {
 164         for (int i = 0; i < mod; i++, index++) {
 165           it.next();
 166         }
 167       } else {
 168         index = index + mod - collectionSize;
 169         it = collection.iterator();
 170         for (int i = 0; i < index; i++) {
 171           it.next();
 172         }
 173       }
 174       sample[sampleCount++] = it.getDocID();
 175     }
 176     if (returnTimings) {
 177       times[2] = System.currentTimeMillis();
 178     }
 179   }
 180
 181   /**
 182    * Returns a value which will allow the caller to walk
 183    * a collection of <code>collectionSize</code> values, without repeating or missing
 184    * any, and spanning the collection from beginning to end at least once with
 185    * <code>sampleSize</code> visited locations. Choosing a value
 186    * that is relatively prime to the collection size ensures that stepping by that size (modulo
 187    * the collection size) will hit all locations without repeating, eliminating the need to
 188    * track previously visited locations for a "without replacement" sample. Starting with the
 189    * square root of the collection size ensures that either the first or second prime tried will
 190    * work (they can't both divide the collection size). It also has the property that N steps of
 191    * size N will span a collection of N**2 elements once. If the sample is bigger than N, it will
 192    * wrap multiple times (without repeating). If the sample is smaller, a step size is chosen
 193    * that will result in at least one spanning of the collection.
 194    *
 195    * @param collectionSize The number of values in the collection to be sampled.
 196    * @param sampleSize The number of values wanted in the sample.
 197    * @return A good increment value for walking the collection.
 198    */
 199   private static int findGoodStepSize(int collectionSize, int sampleSize) {
 200     int i = (int) Math.sqrt(collectionSize);
 201     if (sampleSize < i) {
 202       i = collectionSize / sampleSize;
 203     }
 204     do {
 205       i = findNextPrimeAfter(i);
 206     } while (collectionSize % i == 0);
 207     return i;
 208   }
 209
 210   /**
 211    * Returns the first prime number that is larger than <code>n</code>.
 212    * @param n A number less than the prime to be returned.
 213    * @return The smallest prime larger than <code>n</code>.
 214    */
 215   private static int findNextPrimeAfter(int n) {
 216     n += (n % 2 == 0) ? 1 : 2; // next odd
 217     foundFactor: for (;; n += 2) { //TODO labels??!!
 218       int sri = (int) (Math.sqrt(n));
 219       for (int primeIndex = 0; primeIndex < N_PRIMES; primeIndex++) {
 220         int p = primes[primeIndex];
 221         if (p > sri) {
 222           return n;
 223         }
 224         if (n % p == 0) {
 225           continue foundFactor;
 226         }
 227       }
 228       for (int p = primes[N_PRIMES - 1] + 2;; p += 2) {
 229         if (p > sri) {
 230           return n;
 231         }
 232         if (n % p == 0) {
 233           continue foundFactor;
 234         }
 235       }
 236     }
 237   }
 238
 239   /**
 240    * The first N_PRIMES primes, after 2.
 241    */
 242   private static final int N_PRIMES = 4000;
 243   private static int[] primes = new int[N_PRIMES];
 244   static {
 245     primes[0] = 3;
 246     for (int count = 1; count < N_PRIMES; count++) {
 247       primes[count] = findNextPrimeAfter(primes[count - 1]);
 248     }
 249   }
 250
 251   /**
 252    * Returns <code>sample</code>.length values chosen from the first <code>collectionSize</code>
 253    * locations of <code>collection</code>, using the HASHING algorithm. Performance measurements
 254    * are returned in <code>times</code>, which must be an array of at least three longs. The first
 255    * will be set when the algorithm starts; the second, when a hash key has been calculated and
 256    * inserted into the priority queue for every element in the collection; and the third when the
 257    * original elements associated with the keys remaining in the PQ have been stored in the sample
 258    * array for return.
 259    * <P>
 260    * This algorithm slows as the sample size becomes a significant fraction of the collection
 261    * size, because the PQ is as large as the sample set, and will not do early rejection of values
 262    * below the minimum until it fills up, and a larger PQ contains more small values to be purged,
 263    * resulting in less early rejection and more logN insertions.
 264    *
 265    * @param collection The set to be sampled.
 266    * @param collectionSize The number of values to use (starting from first).
 267    * @param sample The array in which to return the sample.
 268    * @param times The times of three events, for measuring performance.
 269    */
 270   private static void sample2(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times)
 271   throws IOException {
 272     if (returnTimings) {
 273       times[0] = System.currentTimeMillis();
 274     }
 275     int sampleSize = sample.length;
 276     IntPriorityQueue pq = new IntPriorityQueue(sampleSize);
 277     /*
 278      * Convert every value in the collection to a hashed "weight" value, and insert
 279      * into a bounded PQ (retains only sampleSize highest weights).
 280      */
 281     ScoredDocIDsIterator it = collection.iterator();
 282     while (it.next()) {
 283       pq.insertWithReuse((int)(it.getDocID() * PHI_32) & 0x7FFFFFFF);
 284     }
 285     if (returnTimings) {
 286       times[1] = System.currentTimeMillis();
 287     }
 288     /*
 289      * Extract heap, convert weights back to original values, and return as integers.
 290      */
 291     Object[] heap = pq.getHeap();
 292     for (int si = 0; si < sampleSize; si++) {
 293       sample[si] = (int)(((IntPriorityQueue.MI)(heap[si+1])).value * PHI_32I) & 0x7FFFFFFF;
 294     }
 295     if (returnTimings) {
 296       times[2] = System.currentTimeMillis();
 297     }
 298   }
 299
 300   /**
 301    * A bounded priority queue for Integers, to retain a specified number of
 302    * the highest-weighted values for return as a random sample.
 303    */
 304   private static class IntPriorityQueue extends PriorityQueue<Object> {
 305
 306     /**
 307      * Creates a bounded PQ of size <code>size</code>.
 308      * @param size The number of elements to retain.
 309      */
 310     public IntPriorityQueue(int size) {
 311       initialize(size);
 312     }
 313
 314     /**
 315      * Inserts an integer with overflow and object reuse.
 316      */
 317     public void insertWithReuse(int intval) {
 318       if (this.mi == null) {
 319         this.mi = new MI();
 320       }
 321       this.mi.value = intval;
 322       this.mi = (MI)this.insertWithOverflow(this.mi);
 323     }
 324
 325     /**
 326      * Returns the underlying data structure for faster access. Extracting elements
 327      * one at a time would require N logN time, and since we want the elements sorted
 328      * in ascending order by value (not weight), the array is useful as-is.
 329      * @return The underlying heap array.
 330      */
 331     public Object[] getHeap() {
 332       return getHeapArray();
 333     }
 334
 335     /**
 336      * Returns true if <code>o1<code>'s weight is less than that of <code>o2</code>, for
 337      * ordering in the PQ.
 338      * @return True if <code>o1</code> weighs less than <code>o2</code>.
 339      */
 340     @Override
 341     public boolean lessThan(Object o1, Object o2) {
 342       return ((MI)o1).value < ((MI)o2).value;
 343     }
 344
 345     /**
 346      * A mutable integer that lets queue objects be reused once they start overflowing.
 347      */
 348     private static class MI {
 349       MI() { }
 350       public int value;
 351     }
 352
 353     /**
 354      * The mutable integer instance for reuse after first overflow.
 355      */
 356     private MI mi;
 357
 358   }
 359
 360   /**
 361    * For specifying which sampling algorithm to use.
 362    */
 363   private enum Algorithm {
 364
 365     /**
 366      * Specifies a methodical traversal algorithm, which is guaranteed to span the collection
 367      * at least once, and never to return duplicates. Faster than the hashing algorithm and
 368      * uses much less space, but the randomness of the sample may be affected by systematic
 369      * variations in the collection. Requires only an array for the sample, and visits only
 370      * the number of elements in the sample set, not the full set.
 371      */
 372     // TODO (Facet): This one produces a bimodal distribution (very flat around
 373     // each peak!) for collection size 10M and sample sizes 10k and 10544.
 374     // Figure out why.
 375     TRAVERSAL,
 376
 377     /**
 378      * Specifies a Fibonacci-style hash algorithm (see Knuth, S&S), which generates a less
 379      * systematically distributed subset of the sampled collection than the traversal method,
 380      * but requires a bounded priority queue the size of the sample, and creates an object
 381      * containing a sampled value and its hash, for every element in the full set.
 382      */
 383     HASHING
 384   }
 385
 386   /**
 387    * For specifying whether to sort the sample.
 388    */
 389   private enum Sorted {
 390
 391     /**
 392      * Sort resulting sample before returning.
 393      */
 394     YES,
 395
 396     /**
 397      *Do not sort the resulting sample.
 398      */
 399     NO
 400   }
 401
 402   /**
 403    * Magic number 1: prime closest to phi, in 32 bits.
 404    */
 405   private static final long PHI_32 = 2654435769L;
 406
 407   /**
 408    * Magic number 2: multiplicative inverse of PHI_32, modulo 2**32.
 409    */
 410   private static final long PHI_32I = 340573321L;
 411
 412   /**
 413    * Switch to cause methods to return timings.
 414    */
 415   private static boolean returnTimings = false;
 416
 417 }