1 package org.apache.lucene.facet.search.sampling;
3 import java.io.IOException;
4 import java.util.logging.Level;
5 import java.util.logging.Logger;
7 import org.apache.lucene.index.IndexReader;
9 import org.apache.lucene.facet.search.FacetArrays;
10 import org.apache.lucene.facet.search.ScoredDocIDs;
11 import org.apache.lucene.facet.search.aggregator.Aggregator;
12 import org.apache.lucene.facet.search.params.FacetRequest;
13 import org.apache.lucene.facet.search.params.FacetSearchParams;
14 import org.apache.lucene.facet.search.results.FacetResult;
15 import org.apache.lucene.facet.search.results.FacetResultNode;
16 import org.apache.lucene.facet.search.results.MutableFacetResultNode;
17 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
18 import org.apache.lucene.facet.util.RandomSample;
19 import org.apache.lucene.facet.util.ScoredDocIdsUtils;
22 * Licensed to the Apache Software Foundation (ASF) under one or more
23 * contributor license agreements. See the NOTICE file distributed with
24 * this work for additional information regarding copyright ownership.
25 * The ASF licenses this file to You under the Apache License, Version 2.0
26 * (the "License"); you may not use this file except in compliance with
27 * the License. You may obtain a copy of the License at
29 * http://www.apache.org/licenses/LICENSE-2.0
31 * Unless required by applicable law or agreed to in writing, software
32 * distributed under the License is distributed on an "AS IS" BASIS,
33 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
34 * See the License for the specific language governing permissions and
35 * limitations under the License.
39 * Sampling definition for facets accumulation
41 * The Sampler uses TAKMI style counting to provide a 'best guess' top-K result
42 * set of the facets accumulated.
44 * Note: Sampling accumulation (Accumulation over a sampled-set of the results),
45 * does not guarantee accurate values for
46 * {@link FacetResult#getNumValidDescendants()} &
47 * {@link FacetResultNode#getResidue()}.
49 * @lucene.experimental
51 public class Sampler {
53 private static final Logger logger = Logger.getLogger(Sampler.class.getName());
55 private final SamplingParams samplingParams;
58 * Construct with {@link SamplingParams}
61 this(new SamplingParams());
65 * Construct with certain {@link SamplingParams}
66 * @param params sampling params in effect
67 * @throws IllegalArgumentException if the provided SamplingParams are not valid
69 public Sampler(SamplingParams params) throws IllegalArgumentException {
70 if (!params.validate()) {
71 throw new IllegalArgumentException("The provided SamplingParams are not valid!!");
73 this.samplingParams = params;
77 * Check if this sampler would complement for the input docIds
79 public boolean shouldSample(ScoredDocIDs docIds) {
80 return docIds.size() > samplingParams.getSamplingThreshold();
84 * Compute a sample set out of the input set, based on the {@link SamplingParams#getSampleRatio()}
85 * in effect. Sub classes can override to alter how the sample set is
88 * If the input set is of size smaller than {@link SamplingParams#getMinSampleSize()},
89 * the input set is returned (no sampling takes place).
91 * Other than that, the returned set size will not be larger than {@link SamplingParams#getMaxSampleSize()}
92 * nor smaller than {@link SamplingParams#getMinSampleSize()}.
94 * full set of matching documents out of which a sample is needed.
96 public SampleResult getSampleSet(ScoredDocIDs docids) throws IOException {
97 if (!shouldSample(docids)) {
98 return new SampleResult(docids, 1d);
101 int actualSize = docids.size();
102 int sampleSetSize = (int) (actualSize * samplingParams.getSampleRatio());
103 sampleSetSize = Math.max(sampleSetSize, samplingParams.getMinSampleSize());
104 sampleSetSize = Math.min(sampleSetSize, samplingParams.getMaxSampleSize());
106 int[] sampleSet = null;
108 sampleSet = RandomSample.repeatableSample(docids, actualSize,
110 } catch (IOException e) {
111 if (logger.isLoggable(Level.WARNING)) {
112 logger.log(Level.WARNING, "sampling failed: "+e.getMessage()+" - falling back to no sampling!", e);
114 return new SampleResult(docids, 1d);
117 ScoredDocIDs sampled = ScoredDocIdsUtils.createScoredDocIDsSubset(docids,
119 if (logger.isLoggable(Level.FINEST)) {
120 logger.finest("******************** " + sampled.size());
122 return new SampleResult(sampled, sampled.size()/(double)docids.size());
126 * Get a fixer of sample facet accumulation results. Default implementation
127 * returns a <code>TakmiSampleFixer</code> which is adequate only for
128 * counting. For any other accumulator, provide a different fixer.
130 public SampleFixer getSampleFixer(
131 IndexReader indexReader, TaxonomyReader taxonomyReader,
132 FacetSearchParams searchParams) {
133 return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams);
137 * Result of sample computation
139 public final static class SampleResult {
140 public final ScoredDocIDs docids;
141 public final double actualSampleRatio;
142 protected SampleResult(ScoredDocIDs docids, double actualSampleRatio) {
143 this.docids = docids;
144 this.actualSampleRatio = actualSampleRatio;
149 * Return the sampling params in effect
151 public final SamplingParams getSamplingParams() {
152 return samplingParams;
156 * Trim the input facet result.<br>
157 * Note: It is only valid to call this method with result obtained for a
158 * facet request created through {@link #overSampledSearchParams(FacetSearchParams)}.
160 * @throws IllegalArgumentException
161 * if called with results not obtained for requests created
162 * through {@link #overSampledSearchParams(FacetSearchParams)}
164 public FacetResult trimResult(FacetResult facetResult) throws IllegalArgumentException {
165 double overSampleFactor = getSamplingParams().getOversampleFactor();
166 if (overSampleFactor <= 1) { // no factoring done?
170 OverSampledFacetRequest sampledFreq = null;
173 sampledFreq = (OverSampledFacetRequest)facetResult.getFacetRequest();
174 } catch (ClassCastException e) {
175 throw new IllegalArgumentException(
176 "It is only valid to call this method with result obtained for a" +
177 "facet request created through sampler.overSamlpingSearchParams()",
181 FacetRequest origFrq = sampledFreq.orig;
183 MutableFacetResultNode trimmedRootNode = MutableFacetResultNode.toImpl(facetResult.getFacetResultNode());
184 trimmedRootNode.trimSubResults(origFrq.getNumResults());
186 return new FacetResult(origFrq, trimmedRootNode, facetResult.getNumValidDescendants());
190 * Over-sampled search params, wrapping each request with an over-sampled one.
192 public FacetSearchParams overSampledSearchParams(FacetSearchParams original) {
193 FacetSearchParams res = original;
194 // So now we can sample -> altering the searchParams to accommodate for the statistical error for the sampling
195 double overSampleFactor = getSamplingParams().getOversampleFactor();
196 if (overSampleFactor > 1) { // any factoring to do?
197 res = new FacetSearchParams(original.getFacetIndexingParams());
198 for (FacetRequest frq: original.getFacetRequests()) {
199 int overSampledNumResults = (int) Math.ceil(frq.getNumResults() * overSampleFactor);
200 res.addFacetRequest(new OverSampledFacetRequest(frq, overSampledNumResults));
207 * Wrapping a facet request for over sampling.
208 * Implementation detail: even if the original request is a count request, no
209 * statistics will be computed for it as the wrapping is not a count request.
210 * This is ok, as the sampling accumulator is later computing the statistics
211 * over the original requests.
213 private static class OverSampledFacetRequest extends FacetRequest {
214 final FacetRequest orig;
215 public OverSampledFacetRequest(FacetRequest orig, int num) {
216 super(orig.getCategoryPath(), num);
221 public Aggregator createAggregator(boolean useComplements,
222 FacetArrays arrays, IndexReader indexReader,
223 TaxonomyReader taxonomy) throws IOException {
224 return orig.createAggregator(useComplements, arrays, indexReader,
229 public double getValueOf(FacetArrays arrays, int idx) {
230 return orig.getValueOf(arrays, idx);
234 public boolean requireDocumentScore() {
235 return orig.requireDocumentScore();