1 package org.apache.lucene.facet.search.sampling;
3 import java.io.IOException;
5 import org.apache.lucene.index.IndexReader;
7 import org.apache.lucene.facet.search.FacetArrays;
8 import org.apache.lucene.facet.search.ScoredDocIDs;
9 import org.apache.lucene.facet.search.aggregator.Aggregator;
10 import org.apache.lucene.facet.search.params.FacetRequest;
11 import org.apache.lucene.facet.search.params.FacetSearchParams;
12 import org.apache.lucene.facet.search.results.FacetResult;
13 import org.apache.lucene.facet.search.results.FacetResultNode;
14 import org.apache.lucene.facet.search.results.MutableFacetResultNode;
15 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
18 * Licensed to the Apache Software Foundation (ASF) under one or more
19 * contributor license agreements. See the NOTICE file distributed with
20 * this work for additional information regarding copyright ownership.
21 * The ASF licenses this file to You under the Apache License, Version 2.0
22 * (the "License"); you may not use this file except in compliance with
23 * the License. You may obtain a copy of the License at
25 * http://www.apache.org/licenses/LICENSE-2.0
27 * Unless required by applicable law or agreed to in writing, software
28 * distributed under the License is distributed on an "AS IS" BASIS,
29 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
30 * See the License for the specific language governing permissions and
31 * limitations under the License.
35 * Sampling definition for facets accumulation
37 * The Sampler uses TAKMI style counting to provide a 'best guess' top-K result
38 * set of the facets accumulated.
40 * Note: Sampling accumulation (Accumulation over a sampled-set of the results),
41 * does not guarantee accurate values for
42 * {@link FacetResult#getNumValidDescendants()} &amp;
43 * {@link FacetResultNode#getResidue()}.
45 * @lucene.experimental
47 public abstract class Sampler {
49 protected final SamplingParams samplingParams;
/**
 * Construct with default {@link SamplingParams}.
 */
public Sampler() {
  this(new SamplingParams());
}
/**
 * Construct with certain {@link SamplingParams}.
 *
 * @param params sampling params in effect
 * @throws IllegalArgumentException if the provided SamplingParams are not valid
 */
public Sampler(SamplingParams params) throws IllegalArgumentException {
  if (params.validate()) {
    this.samplingParams = params;
  } else {
    throw new IllegalArgumentException("The provided SamplingParams are not valid!!");
  }
}
71 * Check if this sampler would complement for the input docIds
73 public boolean shouldSample(ScoredDocIDs docIds) {
74 return docIds.size() > samplingParams.getSamplingThreshold();
78 * Compute a sample set out of the input set, based on the {@link SamplingParams#getSampleRatio()}
79 * in effect. Sub classes can override to alter how the sample set is
82 * If the input set is of size smaller than {@link SamplingParams#getMinSampleSize()},
83 * the input set is returned (no sampling takes place).
85 * Other than that, the returned set size will not be larger than {@link SamplingParams#getMaxSampleSize()}
86 * nor smaller than {@link SamplingParams#getMinSampleSize()}.
88 * full set of matching documents out of which a sample is needed.
90 public SampleResult getSampleSet(ScoredDocIDs docids) throws IOException {
91 if (!shouldSample(docids)) {
92 return new SampleResult(docids, 1d);
95 int actualSize = docids.size();
96 int sampleSetSize = (int) (actualSize * samplingParams.getSampleRatio());
97 sampleSetSize = Math.max(sampleSetSize, samplingParams.getMinSampleSize());
98 sampleSetSize = Math.min(sampleSetSize, samplingParams.getMaxSampleSize());
100 return createSample(docids, actualSize, sampleSetSize);
/**
 * Create and return a sample of the input set.
 *
 * @param docids input set out of which a sample is to be created
 * @param actualSize original size of set, prior to sampling
 * @param sampleSetSize required size of sample set
 * @return sample of the input set in the required size
 */
protected abstract SampleResult createSample(ScoredDocIDs docids, int actualSize,
    int sampleSetSize) throws IOException;
114 * Get a fixer of sample facet accumulation results. Default implementation
115 * returns a <code>TakmiSampleFixer</code> which is adequate only for
116 * counting. For any other accumulator, provide a different fixer.
118 public SampleFixer getSampleFixer(
119 IndexReader indexReader, TaxonomyReader taxonomyReader,
120 FacetSearchParams searchParams) {
121 return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams);
125 * Result of sample computation
127 public final static class SampleResult {
128 public final ScoredDocIDs docids;
129 public final double actualSampleRatio;
130 protected SampleResult(ScoredDocIDs docids, double actualSampleRatio) {
131 this.docids = docids;
132 this.actualSampleRatio = actualSampleRatio;
137 * Return the sampling params in effect
139 public final SamplingParams getSamplingParams() {
140 return samplingParams;
144 * Trim the input facet result.<br>
145 * Note: It is only valid to call this method with result obtained for a
146 * facet request created through {@link #overSampledSearchParams(FacetSearchParams)}.
148 * @throws IllegalArgumentException
149 * if called with results not obtained for requests created
150 * through {@link #overSampledSearchParams(FacetSearchParams)}
152 public FacetResult trimResult(FacetResult facetResult) throws IllegalArgumentException {
153 double overSampleFactor = getSamplingParams().getOversampleFactor();
154 if (overSampleFactor <= 1) { // no factoring done?
158 OverSampledFacetRequest sampledFreq = null;
161 sampledFreq = (OverSampledFacetRequest)facetResult.getFacetRequest();
162 } catch (ClassCastException e) {
163 throw new IllegalArgumentException(
164 "It is only valid to call this method with result obtained for a" +
165 "facet request created through sampler.overSamlpingSearchParams()",
169 FacetRequest origFrq = sampledFreq.orig;
171 MutableFacetResultNode trimmedRootNode = MutableFacetResultNode.toImpl(facetResult.getFacetResultNode());
172 trimmedRootNode.trimSubResults(origFrq.getNumResults());
174 return new FacetResult(origFrq, trimmedRootNode, facetResult.getNumValidDescendants());
178 * Over-sampled search params, wrapping each request with an over-sampled one.
180 public FacetSearchParams overSampledSearchParams(FacetSearchParams original) {
181 FacetSearchParams res = original;
182 // So now we can sample -> altering the searchParams to accommodate for the statistical error for the sampling
183 double overSampleFactor = getSamplingParams().getOversampleFactor();
184 if (overSampleFactor > 1) { // any factoring to do?
185 res = new FacetSearchParams(original.getFacetIndexingParams());
186 for (FacetRequest frq: original.getFacetRequests()) {
187 int overSampledNumResults = (int) Math.ceil(frq.getNumResults() * overSampleFactor);
188 res.addFacetRequest(new OverSampledFacetRequest(frq, overSampledNumResults));
195 * Wrapping a facet request for over sampling.
196 * Implementation detail: even if the original request is a count request, no
197 * statistics will be computed for it as the wrapping is not a count request.
198 * This is ok, as the sampling accumulator is later computing the statistics
199 * over the original requests.
201 private static class OverSampledFacetRequest extends FacetRequest {
202 final FacetRequest orig;
203 public OverSampledFacetRequest(FacetRequest orig, int num) {
204 super(orig.getCategoryPath(), num);
209 public Aggregator createAggregator(boolean useComplements,
210 FacetArrays arrays, IndexReader indexReader,
211 TaxonomyReader taxonomy) throws IOException {
212 return orig.createAggregator(useComplements, arrays, indexReader,
217 public double getValueOf(FacetArrays arrays, int idx) {
218 return orig.getValueOf(arrays, idx);
222 public boolean requireDocumentScore() {
223 return orig.requireDocumentScore();