1 package org.apache.lucene.facet.search.sampling;
3 import java.io.IOException;
5 import java.util.Random;
7 import org.apache.lucene.index.IndexReader;
8 import org.apache.lucene.index.Term;
9 import org.apache.lucene.search.Query;
10 import org.apache.lucene.search.TermQuery;
12 import org.apache.lucene.search.MultiCollector;
13 import org.apache.lucene.facet.search.BaseTestTopK;
14 import org.apache.lucene.facet.search.FacetsAccumulator;
15 import org.apache.lucene.facet.search.FacetsCollector;
16 import org.apache.lucene.facet.search.ScoredDocIDs;
17 import org.apache.lucene.facet.search.ScoredDocIdCollector;
18 import org.apache.lucene.facet.search.params.FacetSearchParams;
19 import org.apache.lucene.facet.search.results.FacetResult;
20 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
23 * Licensed to the Apache Software Foundation (ASF) under one or more
24 * contributor license agreements. See the NOTICE file distributed with
25 * this work for additional information regarding copyright ownership.
26 * The ASF licenses this file to You under the Apache License, Version 2.0
27 * (the "License"); you may not use this file except in compliance with
28 * the License. You may obtain a copy of the License at
30 * http://www.apache.org/licenses/LICENSE-2.0
32 * Unless required by applicable law or agreed to in writing, software
33 * distributed under the License is distributed on an "AS IS" BASIS,
34 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
35 * See the License for the specific language governing permissions and
36 * limitations under the License.
39 public abstract class BaseSampleTestTopK extends BaseTestTopK {
41 /** Number of top results; passed as the first argument to {@code searchParamsWithRequests} for both the control and the sampled search params. */
42 protected static final int K = 2;
44 /** Maximum number of sampling attempts per partition size: the test may fail even if the code is correct, so a mismatch is rethrown only on the last attempt. */
45 protected static final int RETRIES = 10;
/**
 * Supplies the accumulator for the sampled runs of this test; implemented by concrete subclasses.
 * It is invoked from the {@code initFacetsAccumulator} method of the collector built in
 * {@code samplingCollector}, which immediately calls {@code setComplementThreshold} on the
 * result, so the returned accumulator must not be {@code null}.
 *
 * @param sampler sampler for the current trial, as created by {@code createSampler}
 * @param taxoReader taxonomy reader received by {@code initFacetsAccumulator}
 * @param indexReader index reader received by {@code initFacetsAccumulator}
 * @param searchParams facet search params received by {@code initFacetsAccumulator}
 * @return the accumulator the sampling collector should use
 */
47 protected abstract FacetsAccumulator getSamplingAccumulator(Sampler sampler,
48 TaxonomyReader taxoReader, IndexReader indexReader,
49 FacetSearchParams searchParams);
52 * Try out faceted search with sampling enabled and complements either disabled or enforced.
53 * Lots of randomly generated data is being indexed, and later on a "90% docs" faceted search
54 * is performed. The results are compared to non-sampled ones.
56 public void testCountUsingSamping() throws Exception, IOException {
// NOTE(review): IOException is a subclass of Exception, so naming both in the throws clause is redundant.
// The sampler flavour (random vs. repeatable) is drawn once and reused for every partition size and retry.
57 boolean useRandomSampler = random.nextBoolean();
58 for (int partitionSize : partitionSizes) {
60 initIndex(partitionSize);
61 // Get all of the documents and run the query, then do different
62 // facet counts and compare to control
63 Query q = new TermQuery(new Term(CONTENT_FIELD, BETA)); // 90% of the docs
// The scored doc ids gathered by docCollector are handed to createSampler further down.
64 ScoredDocIdCollector docCollector = ScoredDocIdCollector.create(searcher.maxDoc(), false);
// Control run: an unmodified FacetsCollector yields the results the sampled runs are compared to.
66 FacetSearchParams expectedSearchParams = searchParamsWithRequests(K, partitionSize);
67 FacetsCollector fc = new FacetsCollector(expectedSearchParams, indexReader, taxoReader);
// One search pass feeds both the doc-id collector and the control facets collector.
69 searcher.search(q, MultiCollector.wrap(docCollector, fc));
71 List<FacetResult> expectedResults = fc.getFacetResults();
// Separate params instance for the sampled runs, built with the same arguments as the control params.
73 FacetSearchParams samplingSearchParams = searchParamsWithRequests(K, partitionSize);
75 // try several times in case of failure, because the test has a chance to fail
76 // if the top K facets are not sufficiently common with the sample set
77 for (int nTrial=0; nTrial<RETRIES; nTrial++) {
79 // complement with sampling!
// nTrial is passed through so createSampler can scale the sampling parameters up on each retry.
80 final Sampler sampler = createSampler(nTrial, docCollector.getScoredDocIDs(), useRandomSampler);
// Compare against the control twice with the same sampler: complement disabled (false), then forced (true).
82 assertSampling(expectedResults, q, sampler, samplingSearchParams, false);
83 assertSampling(expectedResults, q, sampler, samplingSearchParams, true);
86 } catch (NotSameResultError e) {
// Only the last trial (nTrial >= RETRIES-1) propagates the mismatch.
87 if (nTrial>=RETRIES-1) {
88 throw e; // no more retries allowed, must fail
/**
 * Runs {@code q} through a sampling facets collector and compares its facet results with the control results.
 *
 * @param expected control results to compare against
 * @param q query to execute on {@code searcher}
 * @param sampler sampler handed to {@code samplingCollector}
 * @param params facet search params for the sampled run
 * @param complement {@code true} to force complements, {@code false} to disable them (see {@code samplingCollector})
 * @throws Exception if any of the calls made here throws
 */
98 private void assertSampling(List<FacetResult> expected, Query q, Sampler sampler, FacetSearchParams params, boolean complement) throws Exception {
99 FacetsCollector samplingFC = samplingCollector(complement, sampler, params);
101 searcher.search(q, samplingFC);
102 List<FacetResult> sampledResults = samplingFC.getFacetResults();
// NOTE(review): the caller catches NotSameResultError, presumably raised by assertSameResults on a mismatch - confirm.
104 assertSameResults(expected, sampledResults);
/**
 * Builds the {@code FacetsCollector} used for a sampled run: its accumulator comes from
 * {@code getSamplingAccumulator} and has its complement threshold set according to {@code complement}.
 *
 * @param complement {@code true} selects {@code FacetsAccumulator.FORCE_COMPLEMENT},
 *        {@code false} selects {@code FacetsAccumulator.DISABLE_COMPLEMENT}
 * @param sampler sampler forwarded to {@code getSamplingAccumulator}
 * @param samplingSearchParams facet search params passed to the {@code FacetsCollector} constructor
 */
107 private FacetsCollector samplingCollector(
108 final boolean complement,
109 final Sampler sampler,
110 FacetSearchParams samplingSearchParams) {
// Anonymous FacetsCollector subclass; initFacetsAccumulator is presumably an override of a base-class hook - confirm.
111 FacetsCollector samplingFC = new FacetsCollector(samplingSearchParams, indexReader, taxoReader) {
113 protected FacetsAccumulator initFacetsAccumulator(
114 FacetSearchParams facetSearchParams, IndexReader indexReader,
115 TaxonomyReader taxonomyReader) {
// The accumulator itself is provided by the concrete test subclass via the abstract factory method.
116 FacetsAccumulator acc = getSamplingAccumulator(sampler, taxonomyReader, indexReader, facetSearchParams);
// Pin complement usage: forced or disabled according to the flag captured from the enclosing method.
117 acc.setComplementThreshold(complement ? FacetsAccumulator.FORCE_COMPLEMENT : FacetsAccumulator.DISABLE_COMPLEMENT);
/**
 * Configures a sampler whose parameters grow with every retry and asserts that it would
 * actually sample the given doc ids.
 *
 * @param nTrial zero-based retry index; every numeric parameter below except the sampling
 *        threshold is multiplied by {@code 1.01^nTrial}
 * @param scoredDocIDs doc ids checked with {@code sampler.shouldSample}
 * @param useRandomSampler {@code true} for a {@code RandomSampler} seeded from the test's
 *        {@code random}, {@code false} for a {@code RepeatableSampler}
 */
124 private Sampler createSampler(int nTrial, ScoredDocIDs scoredDocIDs, boolean useRandomSampler) {
125 SamplingParams samplingParams = new SamplingParams();
// Each retry scales the parameters by a further 1% (factor 1.0 on the first trial).
127 final double retryFactor = Math.pow(1.01, nTrial);
128 samplingParams.setSampleRatio(0.8 * retryFactor);
// The int casts truncate the scaled sizes toward zero.
129 samplingParams.setMinSampleSize((int) (100 * retryFactor));
130 samplingParams.setMaxSampleSize((int) (10000 * retryFactor));
131 samplingParams.setOversampleFactor(5.0 * retryFactor);
133 samplingParams.setSampingThreshold(11000); //force sampling
// The random sampler gets its own Random, seeded from the test's random source.
134 Sampler sampler = useRandomSampler ?
135 new RandomSampler(samplingParams, new Random(random.nextLong())) :
136 new RepeatableSampler(samplingParams);
// Guard: the test is meaningless unless this configuration really samples the collected docs.
137 assertTrue("must enable sampling for this test!",sampler.shouldSample(scoredDocIDs));