package org.apache.lucene.facet.search.sampling;

import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.facet.search.FacetArrays;
import org.apache.lucene.facet.search.ScoredDocIDs;
import org.apache.lucene.facet.search.aggregator.Aggregator;
import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.search.results.MutableFacetResultNode;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.util.RandomSample;
import org.apache.lucene.facet.util.ScoredDocIdsUtils;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Sampling definition for facets accumulation.
 * <p>
 * The Sampler uses TAKMI style counting to provide a 'best guess' top-K result
 * set of the facets accumulated.
 * <p>
 * Note: Sampling accumulation (accumulation over a sampled set of the results)
 * does not guarantee accurate values for
 * {@link FacetResult#getNumValidDescendants()} and
 * {@link FacetResultNode#getResidue()}.
 *
 * @lucene.experimental
 */
public class Sampler {

  private static final Logger logger = Logger.getLogger(Sampler.class.getName());

  private final SamplingParams samplingParams;

  /**
   * Construct with default {@link SamplingParams}.
   */
  public Sampler() {
    this(new SamplingParams());
  }

  /**
   * Construct with certain {@link SamplingParams}.
   *
   * @param params sampling params in effect
   * @throws IllegalArgumentException if the provided SamplingParams are not valid
   */
  public Sampler(SamplingParams params) throws IllegalArgumentException {
    if (!params.validate()) {
      throw new IllegalArgumentException("The provided SamplingParams are not valid!!");
    }
    this.samplingParams = params;
  }

  /**
   * Check whether the input set should be sampled, i.e. whether its size
   * exceeds {@link SamplingParams#getSamplingThreshold()}.
   *
   * @param docIds full set of matching documents
   * @return true if the set is larger than the sampling threshold
   */
  public boolean shouldSample(ScoredDocIDs docIds) {
    return docIds.size() > samplingParams.getSamplingThreshold();
  }

  /**
   * Compute a sample set out of the input set, based on the
   * {@link SamplingParams#getSampleRatio()} in effect. Sub classes can
   * override to alter how the sample set is computed.
   * <p>
   * If {@link #shouldSample(ScoredDocIDs)} rejects the input set, the input
   * set is returned as is with a ratio of 1 (no sampling takes place).
   * <p>
   * Other than that, the requested sample size is raised to at least
   * {@link SamplingParams#getMinSampleSize()} and then capped at
   * {@link SamplingParams#getMaxSampleSize()}.
   * <p>
   * If drawing the random sample fails with an {@link IOException}, a warning
   * is logged and the full input set is returned with a ratio of 1.
   *
   * @param docids
   *          full set of matching documents out of which a sample is needed.
   * @return the sampled documents together with the actual sample ratio
   * @throws IOException if building the sampled subset fails
   */
  public SampleResult getSampleSet(ScoredDocIDs docids) throws IOException {
    if (!shouldSample(docids)) {
      return new SampleResult(docids, 1d);
    }

    int actualSize = docids.size();
    int sampleSetSize = (int) (actualSize * samplingParams.getSampleRatio());
    // Apply the lower bound first so that the upper bound always wins.
    sampleSetSize = Math.max(sampleSetSize, samplingParams.getMinSampleSize());
    sampleSetSize = Math.min(sampleSetSize, samplingParams.getMaxSampleSize());

    final int[] sampleSet;
    try {
      sampleSet = RandomSample.repeatableSample(docids, actualSize, sampleSetSize);
    } catch (IOException e) {
      // Deliberate best-effort: a failed sampling degrades to "no sampling".
      if (logger.isLoggable(Level.WARNING)) {
        logger.log(Level.WARNING, "sampling failed: "+e.getMessage()+" - falling back to no sampling!", e);
      }
      return new SampleResult(docids, 1d);
    }

    ScoredDocIDs sampled = ScoredDocIdsUtils.createScoredDocIDsSubset(docids, sampleSet);
    if (logger.isLoggable(Level.FINEST)) {
      logger.finest("******************** " + sampled.size());
    }
    return new SampleResult(sampled, sampled.size() / (double) docids.size());
  }

  /**
   * Get a fixer of sample facet accumulation results. Default implementation
   * returns a <code>TakmiSampleFixer</code> which is adequate only for
   * counting. For any other accumulator, provide a different fixer.
   */
  public SampleFixer getSampleFixer(
      IndexReader indexReader, TaxonomyReader taxonomyReader,
      FacetSearchParams searchParams) {
    return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams);
  }

  /**
   * Result of sample computation: the (possibly sampled) documents and the
   * ratio between the sample size and the size of the full set.
   */
  public static final class SampleResult {
    public final ScoredDocIDs docids;
    public final double actualSampleRatio;

    protected SampleResult(ScoredDocIDs docids, double actualSampleRatio) {
      this.docids = docids;
      this.actualSampleRatio = actualSampleRatio;
    }
  }

  /**
   * Return the sampling params in effect.
   */
  public final SamplingParams getSamplingParams() {
    return samplingParams;
  }

  /**
   * Trim the input facet result.<br>
   * When the over-sample factor is not greater than 1 the input is returned
   * unchanged. Otherwise the result's sub results are trimmed down to the
   * number of results asked for by the original (wrapped) request.
   * <p>
   * Note: It is only valid to call this method with result obtained for a
   * facet request created through {@link #overSampledSearchParams(FacetSearchParams)}.
   *
   * @param facetResult result obtained for an over-sampled request
   * @return the trimmed result, associated with the original request
   * @throws IllegalArgumentException
   *           if called with results not obtained for requests created
   *           through {@link #overSampledSearchParams(FacetSearchParams)}
   */
  public FacetResult trimResult(FacetResult facetResult) throws IllegalArgumentException {
    double overSampleFactor = getSamplingParams().getOversampleFactor();
    if (overSampleFactor <= 1) { // no factoring done?
      return facetResult;
    }

    // Check the type explicitly rather than relying on a ClassCastException;
    // this also rejects a null request instead of failing later with an NPE.
    FacetRequest request = facetResult.getFacetRequest();
    if (!(request instanceof OverSampledFacetRequest)) {
      throw new IllegalArgumentException(
          "It is only valid to call this method with result obtained for a " +
          "facet request created through sampler.overSampledSearchParams()");
    }

    FacetRequest origFrq = ((OverSampledFacetRequest) request).orig;

    MutableFacetResultNode trimmedRootNode = MutableFacetResultNode.toImpl(facetResult.getFacetResultNode());
    trimmedRootNode.trimSubResults(origFrq.getNumResults());

    return new FacetResult(origFrq, trimmedRootNode, facetResult.getNumValidDescendants());
  }

  /**
   * Over-sampled search params, wrapping each request with an over-sampled one.
   * When the over-sample factor is not greater than 1 the original params are
   * returned as is.
   *
   * @param original search params to wrap
   * @return params whose requests ask for more results, to accommodate for
   *         the statistical error of the sampling
   */
  public FacetSearchParams overSampledSearchParams(FacetSearchParams original) {
    double overSampleFactor = getSamplingParams().getOversampleFactor();
    if (overSampleFactor <= 1) { // no factoring to do
      return original;
    }

    FacetSearchParams res = new FacetSearchParams(original.getFacetIndexingParams());
    for (FacetRequest frq : original.getFacetRequests()) {
      int overSampledNumResults = (int) Math.ceil(frq.getNumResults() * overSampleFactor);
      res.addFacetRequest(new OverSampledFacetRequest(frq, overSampledNumResults));
    }
    return res;
  }

  /**
   * Wrapping a facet request for over sampling.
   * Implementation detail: even if the original request is a count request, no
   * statistics will be computed for it as the wrapping is not a count request.
   * This is ok, as the sampling accumulator is later computing the statistics
   * over the original requests.
   */
  private static class OverSampledFacetRequest extends FacetRequest {
    /** The wrapped request; every computation is delegated to it. */
    final FacetRequest orig;

    public OverSampledFacetRequest(FacetRequest orig, int num) {
      super(orig.getCategoryPath(), num);
      this.orig = orig;
    }

    @Override
    public Aggregator createAggregator(boolean useComplements,
        FacetArrays arrays, IndexReader indexReader,
        TaxonomyReader taxonomy) throws IOException {
      return orig.createAggregator(useComplements, arrays, indexReader, taxonomy);
    }

    @Override
    public double getValueOf(FacetArrays arrays, int idx) {
      return orig.getValueOf(arrays, idx);
    }

    @Override
    public boolean requireDocumentScore() {
      return orig.requireDocumentScore();
    }
  }
}