X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/facet/src/java/org/apache/lucene/facet/util/RandomSample.java diff --git a/lucene-java-3.4.0/lucene/contrib/facet/src/java/org/apache/lucene/facet/util/RandomSample.java b/lucene-java-3.4.0/lucene/contrib/facet/src/java/org/apache/lucene/facet/util/RandomSample.java deleted file mode 100644 index f6ffc59..0000000 --- a/lucene-java-3.4.0/lucene/contrib/facet/src/java/org/apache/lucene/facet/util/RandomSample.java +++ /dev/null @@ -1,639 +0,0 @@ -package org.apache.lucene.facet.util; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.logging.Level; -import java.util.logging.Logger; - -import org.apache.lucene.analysis.KeywordAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.LockObtainFailedException; -import org.apache.lucene.store.RAMDirectory; -import org.apache.lucene.util.PriorityQueue; -import org.apache.lucene.util.Version; - -import org.apache.lucene.facet.search.ScoredDocIDs; -import org.apache.lucene.facet.search.ScoredDocIDsIterator; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Take random samples of large collections. - * - * @lucene.experimental - */ -public class RandomSample { - - private static final Logger logger = Logger.getLogger(RandomSample.class.getName()); - - /** - * Returns sampleSize values from the first collectionSize - * locations of collection, chosen using - * the TRAVERSAL algorithm. The sample values are not sorted. - * @param collection The values from which a sample is wanted. - * @param collectionSize The number of values (from the first) from which to draw the sample. - * @param sampleSize The number of values to return. - * @return An array of values chosen from the collection. - * @see Algorithm#TRAVERSAL - */ - public static int[] repeatableSample(ScoredDocIDs collection, - int collectionSize, int sampleSize) - throws IOException { - return RandomSample.repeatableSample(collection, collectionSize, - sampleSize, Algorithm.HASHING, Sorted.NO); - } - - /** - * Returns sampleSize values from the first collectionSize - * locations of collection, chosen using algorithm. - * @param collection The values from which a sample is wanted. - * @param collectionSize The number of values (from the first) from which to draw the sample. - * @param sampleSize The number of values to return. - * @param algorithm Which algorithm to use. - * @param sorted Sorted.YES to sort the sample values in ascending order before returning; - * Sorted.NO to return them in essentially random order. - * @return An array of values chosen from the collection. - */ - public static int[] repeatableSample(ScoredDocIDs collection, - int collectionSize, int sampleSize, - Algorithm algorithm, Sorted sorted) - throws IOException { - if (collection == null) { - throw new IOException("docIdSet is null"); - } - if (sampleSize < 1) { - throw new IOException("sampleSize < 1 (" + sampleSize + ")"); - } - if (collectionSize < sampleSize) { - throw new IOException("collectionSize (" + collectionSize + ") less than sampleSize (" + sampleSize + ")"); - } - int[] sample = new int[sampleSize]; - long[] times = new long[4]; - if (algorithm == Algorithm.TRAVERSAL) { - RandomSample.sample1(collection, collectionSize, sample, times); - } else if (algorithm == Algorithm.HASHING) { - RandomSample.sample2(collection, collectionSize, sample, times); - } else { - throw new IllegalArgumentException("Invalid algorithm selection"); - } - if (sorted == Sorted.YES) { - Arrays.sort(sample); - } - if (RandomSample.returnTimings) { - times[3] = System.currentTimeMillis(); - if (logger.isLoggable(Level.FINEST)) { - logger.finest("Times: " + (times[1] - times[0]) + "ms, " - + (times[2] - times[1]) + "ms, " + (times[3] - times[2])+"ms"); - } - } - return sample; - } - - /** - * Returns sample.length values chosen from the first collectionSize - * locations of collection, using the TRAVERSAL algorithm. The sample is - * pseudorandom: no subset of the original collection - * is in principle more likely to occur than any other, but for a given collection - * and sample size, the same sample will always be returned. This algorithm walks the - * original collection in a methodical way that is guaranteed not to visit any location - * more than once, which makes sampling without replacement faster because removals don't - * have to be tracked, and the number of operations is proportional to the sample size, - * not the collection size. - * Times for performance measurement - * are returned in times, which must be an array of at least three longs, containing - * nanosecond event times. The first - * is set when the algorithm starts; the second, when the step size has been calculated; - * and the third when the sample has been taken. - * @param collection The set to be sampled. - * @param collectionSize The number of values to use (starting from first). - * @param sample The array in which to return the sample. - * @param times The times of three events, for measuring performance. - */ - private static void sample1(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times) - throws IOException { - ScoredDocIDsIterator it = collection.iterator(); - if (RandomSample.returnTimings) { - times[0] = System.currentTimeMillis(); - } - int sampleSize = sample.length; - int prime = RandomSample.findGoodStepSize(collectionSize, sampleSize); - int mod = prime % collectionSize; - if (RandomSample.returnTimings) { - times[1] = System.currentTimeMillis(); - } - int sampleCount = 0; - int index = 0; - for (; sampleCount < sampleSize;) { - if (index + mod < collectionSize) { - for (int i = 0; i < mod; i++, index++) { - it.next(); - } - } else { - index = index + mod - collectionSize; - it = collection.iterator(); - for (int i = 0; i < index; i++) { - it.next(); - } - } - sample[sampleCount++] = it.getDocID(); - } - if (RandomSample.returnTimings) { - times[2] = System.currentTimeMillis(); - } - } // end RandomSample.sample1() - - /** - * Returns a value which will allow the caller to walk - * a collection of collectionSize values, without repeating or missing - * any, and spanning the collection from beginning to end at least once with - * sampleSize visited locations. Choosing a value - * that is relatively prime to the collection size ensures that stepping by that size (modulo - * the collection size) will hit all locations without repeating, eliminating the need to - * track previously visited locations for a "without replacement" sample. Starting with the - * square root of the collection size ensures that either the first or second prime tried will - * work (they can't both divide the collection size). It also has the property that N steps of - * size N will span a collection of N**2 elements once. If the sample is bigger than N, it will - * wrap multiple times (without repeating). If the sample is smaller, a step size is chosen - * that will result in at least one spanning of the collection. - * - * @param collectionSize The number of values in the collection to be sampled. - * @param sampleSize The number of values wanted in the sample. - * @return A good increment value for walking the collection. - */ - private static int findGoodStepSize(int collectionSize, int sampleSize) { - int i = (int) Math.sqrt(collectionSize); - if (sampleSize < i) { - i = collectionSize / sampleSize; - } - do { - i = RandomSample.findNextPrimeAfter(i); - } while (collectionSize % i == 0); - return i; - } // end RandomSample.findGoodStepSize() - - /** - * Returns the first prime number that is larger than n. - * @param n A number less than the prime to be returned. - * @return The smallest prime larger than n. - */ - private static int findNextPrimeAfter(int n) { - n += (n % 2 == 0) ? 1 : 2; // next odd - foundFactor: for (;; n += 2) { - int sri = (int) (Math.sqrt(n)); - for (int primeIndex = 0; primeIndex < RandomSample.N_PRIMES; primeIndex++) { - int p = RandomSample.primes[primeIndex]; - if (p > sri) { - return n; - } - if (n % p == 0) { - continue foundFactor; - } - } - for (int p = RandomSample.primes[RandomSample.N_PRIMES - 1] + 2;; p += 2) { - if (p > sri) { - return n; - } - if (n % p == 0) { - continue foundFactor; - } - } - } - } // end RandomSample.findNextPrimeAfter() - - /** - * Divides the values in collection into numSubranges - * subranges from minValue to maxValue and returns the - * number of values in each subrange. (For testing the flatness of distribution of - * a sample.) - * @param collection The collection of values to be counted. - * @param range The number of possible values. - * @param numSubranges How many intervals to divide the value range into. - */ - private static int[] countsBySubrange(int[] collection, int range, int numSubranges) { - int[] counts = new int[numSubranges]; - Arrays.fill(counts, 0); - int numInSubrange = range / numSubranges; - for (int j = 0; j < collection.length; j++) { - counts[collection[j] / numInSubrange]++; - } - return counts; - } // end RandomSample.countsBySubrange() - - /** - * Factors value into primes. - */ - public static int[] factor(long value) { - ArrayList list = new ArrayList(); - while (value > 1 && value % 2 == 0) { - list.add(2); - value /= 2; - } - long sqrt = Math.round(Math.sqrt(value)); - for (int pIndex = 0, lim = RandomSample.primes.length; pIndex < lim; pIndex++) { - int p = RandomSample.primes[pIndex]; - if (p >= sqrt) { - break; - } - while (value % p == 0) { - list.add(p); - value /= p; - sqrt = Math.round(Math.sqrt(value)); - } - } - if (list.size() == 0 || value > Integer.MAX_VALUE) { - throw new RuntimeException("Prime or too large to factor: "+value); - } - if ((int)value > 1) { - list.add((int)value); - } - int[] factors = new int[list.size()]; - for (int j = 0; j < factors.length; j++) { - factors[j] = list.get(j).intValue(); - } - return factors; - } // end RandomSample.factor() - - /** - * The first N_PRIMES primes, after 2. - */ - private static final int N_PRIMES = 4000; - private static int[] primes = new int[RandomSample.N_PRIMES]; - static { - RandomSample.primes[0] = 3; - for (int count = 1; count < RandomSample.N_PRIMES; count++) { - primes[count] = RandomSample.findNextPrimeAfter(primes[count - 1]); - } - } - - /** - * Returns sample.length values chosen from the first collectionSize - * locations of collection, using the HASHING algorithm. Performance measurements - * are returned in times, which must be an array of at least three longs. The first - * will be set when the algorithm starts; the second, when a hash key has been calculated and - * inserted into the priority queue for every element in the collection; and the third when the - * original elements associated with the keys remaining in the PQ have been stored in the sample - * array for return. - *

- * This algorithm slows as the sample size becomes a significant fraction of the collection - * size, because the PQ is as large as the sample set, and will not do early rejection of values - * below the minimum until it fills up, and a larger PQ contains more small values to be purged, - * resulting in less early rejection and more logN insertions. - * - * @param collection The set to be sampled. - * @param collectionSize The number of values to use (starting from first). - * @param sample The array in which to return the sample. - * @param times The times of three events, for measuring performance. - */ - private static void sample2(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times) - throws IOException { - if (RandomSample.returnTimings) { - times[0] = System.currentTimeMillis(); - } - int sampleSize = sample.length; - IntPriorityQueue pq = new IntPriorityQueue(sampleSize); - /* - * Convert every value in the collection to a hashed "weight" value, and insert - * into a bounded PQ (retains only sampleSize highest weights). - */ - ScoredDocIDsIterator it = collection.iterator(); - while (it.next()) { - pq.insertWithReuse((int)(it.getDocID() * PHI_32) & 0x7FFFFFFF); - } - if (RandomSample.returnTimings) { - times[1] = System.currentTimeMillis(); - } - /* - * Extract heap, convert weights back to original values, and return as integers. - */ - Object[] heap = pq.getHeap(); - for (int si = 0; si < sampleSize; si++) { - sample[si] = (int)(((IntPriorityQueue.MI)(heap[si+1])).value * PHI_32I) & 0x7FFFFFFF; - } - if (RandomSample.returnTimings) { - times[2] = System.currentTimeMillis(); - } - } // end RandomSample.sample2() - - /** - * A bounded priority queue for Integers, to retain a specified number of - * the highest-weighted values for return as a random sample. - */ - private static class IntPriorityQueue extends PriorityQueue { - - /** - * Creates a bounded PQ of size size. - * @param size The number of elements to retain. - */ - public IntPriorityQueue(int size) { - super(); - this.initialize(size); - } - - /** - * Inserts an integer with overflow and object reuse. - */ - public void insertWithReuse(int intval) { - if (this.mi == null) { - this.mi = new MI(); - } - this.mi.value = intval; - this.mi = (MI)this.insertWithOverflow(this.mi); - } // end IntPriorityQueue.insertWithReuse() - - /** - * Returns the underlying data structure for faster access. Extracting elements - * one at a time would require N logN time, and since we want the elements sorted - * in ascending order by value (not weight), the array is useful as-is. - * @return The underlying heap array. - */ - public Object[] getHeap() { - return getHeapArray(); - } - - /** - * Returns true if o1's weight is less than that of o2, for - * ordering in the PQ. - * @return True if o1 weighs less than o2. - */ - @Override - public boolean lessThan(Object o1, Object o2) { - return ((MI)o1).value < ((MI)o2).value; - } - - /** - * A mutable integer that lets queue objects be reused once they start overflowing. - */ - private static class MI { - MI() { } - public int value; - } // end class RandomSample.IntPriorityQueue.MI - - /** - * The mutable integer instance for reuse after first overflow. - */ - private MI mi; - - } // end class RandomSample.IntPriorityQueue - - /** - * For specifying which sampling algorithm to use. - */ - public static class Algorithm { - - /** - * Specifies a methodical traversal algorithm, which is guaranteed to span the collection - * at least once, and never to return duplicates. Faster than the hashing algorithm and - * uses much less space, but the randomness of the sample may be affected by systematic - * variations in the collection. Requires only an array for the sample, and visits only - * the number of elements in the sample set, not the full set. - */ - // TODO (Facet): This one produces a bimodal distribution (very flat around - // each peak!) for collection size 10M and sample sizes 10k and 10544. - // Figure out why. - public static final Algorithm TRAVERSAL = new Algorithm("Traversal"); - - /** - * Specifies a Fibonacci-style hash algorithm (see Knuth, S&S), which generates a less - * systematically distributed subset of the sampled collection than the traversal method, - * but requires a bounded priority queue the size of the sample, and creates an object - * containing a sampled value and its hash, for every element in the full set. - */ - public static final Algorithm HASHING = new Algorithm("Hashing"); - - /** - * Constructs an instance of an algorithm. - * @param name An ID for printing. - */ - private Algorithm(String name) { - this.name = name; - } - - /** - * Prints this algorithm's name. - */ - @Override - public String toString() { - return this.name; - } - - /** - * The name of this algorithm, for printing. - */ - private String name; - - } // end class RandomSample.Algorithm - - /** - * For specifying whether to sort the sample. - */ - public static class Sorted { - - /** - * Specifies sorting the resulting sample before returning. - */ - public static final Sorted YES = new Sorted("sorted"); - - /** - * Specifies not sorting the resulting sample. - */ - public static final Sorted NO = new Sorted("unsorted"); - - /** - * Constructs an instance of a "sorted" selector. - * @param name An ID for printing. - */ - private Sorted(String name) { - this.name = name; - } - - /** - * Prints this selector's name. - */ - @Override - public String toString() { - return this.name; - } - - /** - * The name of this selector, for printing. - */ - private String name; - - } // end class RandomSample.Sorted - - /** - * Magic number 1: prime closest to phi, in 32 bits. - */ - private static final long PHI_32 = 2654435769L; - - /** - * Magic number 2: multiplicative inverse of PHI_32, modulo 2**32. - */ - private static final long PHI_32I = 340573321L; - - /** - * Switch to cause methods to return timings. - */ - private static boolean returnTimings = false; - - /** - * Self-test. - */ - public static void main(String[] args) throws Exception { - RandomSample.returnTimings = true; - /* - * Create an array of sequential integers, from which samples will be taken. - */ - final int COLLECTION_SIZE = 10 * 1000 * 1000; - ScoredDocIDs collection = createAllScoredDocs(COLLECTION_SIZE); - - /* - * Factor PHI. - * - int[] factors = RandomSample.factor(PHI_32); - System.out.print("Factors of PHI_32: "); - for (int k : factors) { - System.out.print(k+", "); - } - System.out.println(""); - - * Verify inverse relationship of PHI & phi. - * - boolean inverseValid = true; - for (int j = 0; j < Integer.MAX_VALUE; j++) { - int k = (int)(j * PHI_32) & 0x7FFFFFFF; - int m = (int)(k * PHI_32I) & 0X7FFFFFFF; - if (j != m) { - System.out.println("Inverse not valid for "+j); - inverseValid = false; - } - } - System.out.println("Inverse valid? "+inverseValid); - */ - /* - * Take samples of various sizes from the full set, verify no duplicates, - * check flatness. - */ - int[] sampleSizes = { - 10, 57, 100, 333, 1000, 2154, 10000 - }; - Algorithm[] algorithms = { Algorithm.HASHING, Algorithm.TRAVERSAL }; - for (int sampleSize : sampleSizes) { - for (Algorithm algorithm : algorithms) { - System.out.println("Sample size " + sampleSize - + ", algorithm " + algorithm + "..."); - /* - * Take the sample. - */ - int[] sample = RandomSample.repeatableSample( - collection, COLLECTION_SIZE, sampleSize, algorithm, Sorted.YES); - /* - * Check for duplicates. - */ - boolean noDups = true; - for (int j = 0; j < sampleSize - 1; j++) { - if (sample[j] == sample[j + 1]) { - System.out.println("Duplicate value " - + sample[j] + " at " + j + ", " - + (j + 1)); - noDups = false; - break; - } - } - if (noDups) { - System.out.println("No duplicates."); - } - if (algorithm == Algorithm.HASHING) { - System.out.print("Hashed sample, up to 100 of "+sampleSize+": "); - int lim = Math.min(100, sampleSize); - for (int k = 0; k < lim; k++) { - System.out.print(sample[k]+", "); - } - System.out.println(""); - } - /* - * Check flatness of distribution in sample. - */ - final int N_INTERVALS = 100; - int[] counts = RandomSample.countsBySubrange(sample, COLLECTION_SIZE, N_INTERVALS); - int minCount = Integer.MAX_VALUE; - int maxCount = Integer.MIN_VALUE; - int avgCount = 0; - for (int j = 0; j < N_INTERVALS; j++) { - int count = counts[j]; - if (count < minCount) { - minCount = count; - } - if (count > maxCount) { - maxCount = count; - } - avgCount += count; - } - avgCount /= N_INTERVALS; - System.out.println("Min, max, avg: "+minCount+", "+maxCount+", "+avgCount); - - if (((double)minCount - avgCount)/avgCount < -0.05 && (minCount - avgCount) < -5) { - System.out.println("Not flat enough."); - } else if (((double)maxCount - avgCount)/avgCount > 0.05 && (maxCount - avgCount) > 5) { - System.out.println("Not flat enough."); - } else { - System.out.println("Flat enough."); - } - if (sampleSize == 10544 && algorithm == Algorithm.TRAVERSAL) { - System.out.print("Counts of interest: "); - for (int j = 0; j < N_INTERVALS; j++) { - System.out.print(counts[j]+", "); - } - System.out.println(""); - } - } - } - System.out.println("Last prime is " - + RandomSample.primes[RandomSample.N_PRIMES - 1]); - } - - private static ScoredDocIDs createAllScoredDocs(final int COLLECTION_SIZE) - throws CorruptIndexException, LockObtainFailedException, IOException { - ScoredDocIDs collection; - - IndexReader reader = null; - Directory ramDir = new RAMDirectory(); - try { - IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_30, new KeywordAnalyzer())); - for (int i = 0; i < COLLECTION_SIZE; i++) { - writer.addDocument(new Document()); - } - writer.commit(); - writer.close(); - reader = IndexReader.open(ramDir); - collection = ScoredDocIdsUtils.createAllDocsScoredDocIDs(reader); - } finally { - if (reader != null) { - reader.close(); - } - ramDir.close(); - } - return collection; - } -} // end class RandomSample