+++ /dev/null
-package org.apache.lucene.facet.search.sampling;
-
-import java.io.IOException;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-
-import org.apache.lucene.index.IndexReader;
-
-import org.apache.lucene.facet.search.FacetArrays;
-import org.apache.lucene.facet.search.ScoredDocIDs;
-import org.apache.lucene.facet.search.aggregator.Aggregator;
-import org.apache.lucene.facet.search.params.FacetRequest;
-import org.apache.lucene.facet.search.params.FacetSearchParams;
-import org.apache.lucene.facet.search.results.FacetResult;
-import org.apache.lucene.facet.search.results.FacetResultNode;
-import org.apache.lucene.facet.search.results.MutableFacetResultNode;
-import org.apache.lucene.facet.taxonomy.TaxonomyReader;
-import org.apache.lucene.facet.util.RandomSample;
-import org.apache.lucene.facet.util.ScoredDocIdsUtils;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Sampling definition for facets accumulation
- * <p>
- * The Sampler uses TAKMI style counting to provide a 'best guess' top-K result
- * set of the facets accumulated.
- * <p>
- * Note: Sampling accumulation (Accumulation over a sampled-set of the results),
- * does not guarantee accurate values for
- * {@link FacetResult#getNumValidDescendants()} and
- * {@link FacetResultNode#getResidue()}.
- *
- * @lucene.experimental
- */
-public class Sampler {
-
-  private static final Logger logger = Logger.getLogger(Sampler.class.getName());
-
-  /** Sampling parameters in effect for this sampler; validated at construction. */
-  private final SamplingParams samplingParams;
-
-  /**
-   * Construct with default {@link SamplingParams}.
-   */
-  public Sampler() {
-    this(new SamplingParams());
-  }
-
-  /**
-   * Construct with certain {@link SamplingParams}
-   * @param params sampling params in effect
-   * @throws IllegalArgumentException if the provided SamplingParams are not valid
-   */
-  public Sampler(SamplingParams params) throws IllegalArgumentException {
-    if (!params.validate()) {
-      throw new IllegalArgumentException("The provided SamplingParams are not valid!!");
-    }
-    this.samplingParams = params;
-  }
-
-  /**
-   * Check whether sampling should take place for the input docIds.
-   *
-   * @param docIds full set of matching documents
-   * @return <code>true</code> if the size of the input set is greater than
-   *         {@link SamplingParams#getSamplingThreshold()}
-   */
-  public boolean shouldSample(ScoredDocIDs docIds) {
-    return docIds.size() > samplingParams.getSamplingThreshold();
-  }
-
-  /**
-   * Compute a sample set out of the input set, based on the {@link SamplingParams#getSampleRatio()}
-   * in effect. Sub classes can override to alter how the sample set is
-   * computed.
-   * <p>
-   * If {@link #shouldSample(ScoredDocIDs)} is false for the input set (its size
-   * is not greater than {@link SamplingParams#getSamplingThreshold()}), the
-   * input set is returned with a sample ratio of 1 (no sampling takes place).
-   * <p>
-   * Other than that, the requested sample size is the input size multiplied by
-   * the sample ratio, raised to at least {@link SamplingParams#getMinSampleSize()}
-   * and then capped at {@link SamplingParams#getMaxSampleSize()}.
-   * <p>
-   * If drawing the random sample fails with an {@link IOException}, a warning
-   * is logged and the full input set is returned with a sample ratio of 1.
-   *
-   * @param docids
-   *          full set of matching documents out of which a sample is needed.
-   * @return the sampled documents together with the ratio between the sample
-   *         size and the input size
-   * @throws IOException declared by this method; failures while drawing the
-   *         sample are handled by falling back to no sampling
-   */
-  public SampleResult getSampleSet(ScoredDocIDs docids) throws IOException {
-    if (!shouldSample(docids)) {
-      return new SampleResult(docids, 1d);
-    }
-
-    int actualSize = docids.size();
-    int sampleSetSize = (int) (actualSize * samplingParams.getSampleRatio());
-    // The lower bound is applied first, so the upper bound wins if the two conflict.
-    sampleSetSize = Math.max(sampleSetSize, samplingParams.getMinSampleSize());
-    sampleSetSize = Math.min(sampleSetSize, samplingParams.getMaxSampleSize());
-
-    int[] sampleSet = null;
-    try {
-      sampleSet = RandomSample.repeatableSample(docids, actualSize,
-          sampleSetSize);
-    } catch (IOException e) {
-      // Deliberate best effort: a failed sampling degrades to accumulating over all documents.
-      if (logger.isLoggable(Level.WARNING)) {
-        logger.log(Level.WARNING, "sampling failed: "+e.getMessage()+" - falling back to no sampling!", e);
-      }
-      return new SampleResult(docids, 1d);
-    }
-
-    ScoredDocIDs sampled = ScoredDocIdsUtils.createScoredDocIDsSubset(docids,
-        sampleSet);
-    if (logger.isLoggable(Level.FINEST)) {
-      logger.finest("******************** " + sampled.size());
-    }
-    return new SampleResult(sampled, sampled.size()/(double)docids.size());
-  }
-
-  /**
-   * Get a fixer of sample facet accumulation results. Default implementation
-   * returns a <code>TakmiSampleFixer</code> which is adequate only for
-   * counting. For any other accumulator, provide a different fixer.
-   */
-  public SampleFixer getSampleFixer(
-      IndexReader indexReader, TaxonomyReader taxonomyReader,
-      FacetSearchParams searchParams) {
-    return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams);
-  }
-
-  /**
-   * Result of sample computation
-   */
-  public final static class SampleResult {
-    /** The documents to accumulate over (the sample, or the full set when no sampling took place). */
-    public final ScoredDocIDs docids;
-    /** Ratio between the size of {@link #docids} and the size of the original set; 1 when not sampled. */
-    public final double actualSampleRatio;
-    protected SampleResult(ScoredDocIDs docids, double actualSampleRatio) {
-      this.docids = docids;
-      this.actualSampleRatio = actualSampleRatio;
-    }
-  }
-
-  /**
-   * Return the sampling params in effect
-   */
-  public final SamplingParams getSamplingParams() {
-    return samplingParams;
-  }
-
-  /**
-   * Trim the input facet result.<br>
-   * Note: It is only valid to call this method with result obtained for a
-   * facet request created through {@link #overSampledSearchParams(FacetSearchParams)}.
-   * <p>
-   * If the over-sample factor in effect is not greater than 1, the input
-   * result is returned as is.
-   *
-   * @param facetResult result obtained for an over-sampled facet request
-   * @return a result for the original request, with the sub results of the
-   *         root node trimmed to the number of results originally requested
-   * @throws IllegalArgumentException
-   *             if called with results not obtained for requests created
-   *             through {@link #overSampledSearchParams(FacetSearchParams)}
-   */
-  public FacetResult trimResult(FacetResult facetResult) throws IllegalArgumentException {
-    double overSampleFactor = getSamplingParams().getOversampleFactor();
-    if (overSampleFactor <= 1) { // no factoring done?
-      return facetResult;
-    }
-
-    // Explicit type check rather than catching ClassCastException; a null
-    // request is rejected here as well instead of failing later with an NPE.
-    FacetRequest request = facetResult.getFacetRequest();
-    if (!(request instanceof OverSampledFacetRequest)) {
-      throw new IllegalArgumentException(
-          "It is only valid to call this method with result obtained for a " +
-          "facet request created through sampler.overSampledSearchParams()");
-    }
-
-    FacetRequest origFrq = ((OverSampledFacetRequest) request).orig;
-
-    MutableFacetResultNode trimmedRootNode = MutableFacetResultNode.toImpl(facetResult.getFacetResultNode());
-    trimmedRootNode.trimSubResults(origFrq.getNumResults());
-
-    return new FacetResult(origFrq, trimmedRootNode, facetResult.getNumValidDescendants());
-  }
-
-  /**
-   * Over-sampled search params, wrapping each request with an over-sampled one.
-   * <p>
-   * If the over-sample factor in effect is not greater than 1, the original
-   * params are returned unchanged.
-   *
-   * @param original the search params as requested by the caller
-   * @return search params in which each request asks for
-   *         <code>ceil(numResults * overSampleFactor)</code> results
-   */
-  public FacetSearchParams overSampledSearchParams(FacetSearchParams original) {
-    FacetSearchParams res = original;
-    // So now we can sample -> altering the searchParams to accommodate for the statistical error for the sampling
-    double overSampleFactor = getSamplingParams().getOversampleFactor();
-    if (overSampleFactor > 1) { // any factoring to do?
-      res = new FacetSearchParams(original.getFacetIndexingParams());
-      for (FacetRequest frq: original.getFacetRequests()) {
-        int overSampledNumResults = (int) Math.ceil(frq.getNumResults() * overSampleFactor);
-        res.addFacetRequest(new OverSampledFacetRequest(frq, overSampledNumResults));
-      }
-    }
-    return res;
-  }
-
-  /**
-   * Wrapping a facet request for over sampling.
-   * Implementation detail: even if the original request is a count request, no
-   * statistics will be computed for it as the wrapping is not a count request.
-   * This is ok, as the sampling accumulator is later computing the statistics
-   * over the original requests.
-   */
-  private static class OverSampledFacetRequest extends FacetRequest {
-    /** The wrapped request; all aggregation behavior is delegated to it. */
-    final FacetRequest orig;
-    public OverSampledFacetRequest(FacetRequest orig, int num) {
-      super(orig.getCategoryPath(), num);
-      this.orig = orig;
-    }
-
-    @Override
-    public Aggregator createAggregator(boolean useComplements,
-        FacetArrays arrays, IndexReader indexReader,
-        TaxonomyReader taxonomy) throws IOException {
-      return orig.createAggregator(useComplements, arrays, indexReader,
-          taxonomy);
-    }
-
-    @Override
-    public double getValueOf(FacetArrays arrays, int idx) {
-      return orig.getValueOf(arrays, idx);
-    }
-
-    @Override
-    public boolean requireDocumentScore() {
-      return orig.requireDocumentScore();
-    }
-  }
-}