1 package org.apache.lucene.facet.search.sampling;
3 import java.io.IOException;
6 import org.apache.lucene.index.IndexReader;
7 import org.apache.lucene.index.Term;
8 import org.apache.lucene.search.Query;
9 import org.apache.lucene.search.TermQuery;
11 import org.apache.lucene.search.MultiCollector;
12 import org.apache.lucene.facet.search.BaseTestTopK;
13 import org.apache.lucene.facet.search.FacetsAccumulator;
14 import org.apache.lucene.facet.search.FacetsCollector;
15 import org.apache.lucene.facet.search.ScoredDocIDs;
16 import org.apache.lucene.facet.search.ScoredDocIdCollector;
17 import org.apache.lucene.facet.search.params.FacetSearchParams;
18 import org.apache.lucene.facet.search.results.FacetResult;
19 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
22 * Licensed to the Apache Software Foundation (ASF) under one or more
23 * contributor license agreements. See the NOTICE file distributed with
24 * this work for additional information regarding copyright ownership.
25 * The ASF licenses this file to You under the Apache License, Version 2.0
26 * (the "License"); you may not use this file except in compliance with
27 * the License. You may obtain a copy of the License at
29 * http://www.apache.org/licenses/LICENSE-2.0
31 * Unless required by applicable law or agreed to in writing, software
32 * distributed under the License is distributed on an "AS IS" BASIS,
33 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
34 * See the License for the specific language governing permissions and
35 * limitations under the License.
38 public abstract class BaseSampleTestTopK extends BaseTestTopK {
40 /** Number of top results */
41 protected static final int K = 2;
43 /** since there is a chance that this test would fail even if the code is correct, retry the sampling */
44 protected static final int RETRIES = 4;
46 protected abstract FacetsAccumulator getSamplingAccumulator(Sampler sampler,
47 TaxonomyReader taxoReader, IndexReader indexReader,
48 FacetSearchParams searchParams);
51 * Try out faceted search with sampling enabled and complements either disabled or enforced
52 * Lots of randomly generated data is being indexed, and later on a "90% docs" faceted search
53 * is performed. The results are compared to non-sampled ones.
55 public void testCountUsingSamping() throws Exception, IOException {
56 for (int partitionSize : partitionSizes) {
57 initIndex(partitionSize);
59 // Get all of the documents and run the query, then do different
60 // facet counts and compare to control
61 Query q = new TermQuery(new Term(CONTENT_FIELD, BETA)); // 90% of the docs
62 ScoredDocIdCollector docCollector = ScoredDocIdCollector.create(searcher.maxDoc(), false);
64 FacetSearchParams expectedSearchParams = searchParamsWithRequests(K, partitionSize);
65 FacetsCollector fc = new FacetsCollector(expectedSearchParams, indexReader, taxoReader);
67 searcher.search(q, MultiCollector.wrap(docCollector, fc));
69 List<FacetResult> expectedResults = fc.getFacetResults();
71 // complement with sampling!
72 final Sampler sampler = createSampler(docCollector.getScoredDocIDs());
74 FacetSearchParams samplingSearchParams = searchParamsWithRequests(K, partitionSize);
76 assertSampling(expectedResults, q, sampler, samplingSearchParams, false);
77 assertSampling(expectedResults, q, sampler, samplingSearchParams, true);
83 private void assertSampling(List<FacetResult> expected, Query q, Sampler sampler, FacetSearchParams params, boolean complement) throws Exception {
84 // try several times in case of failure, because the test has a chance to fail
85 // if the top K facets are not sufficiently common with the sample set
86 for (int n=RETRIES; n>0; n--) {
87 FacetsCollector samplingFC = samplingCollector(false, sampler, params);
89 searcher.search(q, samplingFC);
90 List<FacetResult> sampledResults = samplingFC.getFacetResults();
93 assertSameResults(expected, sampledResults);
95 } catch (Exception e) {
96 if (n<=1) { // otherwise try again
103 private FacetsCollector samplingCollector(
104 final boolean complement,
105 final Sampler sampler,
106 FacetSearchParams samplingSearchParams) {
107 FacetsCollector samplingFC = new FacetsCollector(samplingSearchParams, indexReader, taxoReader) {
109 protected FacetsAccumulator initFacetsAccumulator(
110 FacetSearchParams facetSearchParams, IndexReader indexReader,
111 TaxonomyReader taxonomyReader) {
112 FacetsAccumulator acc = getSamplingAccumulator(sampler, taxonomyReader, indexReader, facetSearchParams);
113 acc.setComplementThreshold(complement ? FacetsAccumulator.FORCE_COMPLEMENT : FacetsAccumulator.DISABLE_COMPLEMENT);
120 private Sampler createSampler(ScoredDocIDs scoredDocIDs) {
121 SamplingParams samplingParams = new SamplingParams();
122 samplingParams.setSampleRatio(0.8);
123 samplingParams.setMinSampleSize(100);
124 samplingParams.setMaxSampleSize(10000);
125 samplingParams.setSampingThreshold(11000); //force sampling
126 samplingParams.setOversampleFactor(5.0);
127 Sampler sampler = new Sampler(samplingParams);
128 assertTrue("must enable sampling for this test!",sampler.shouldSample(scoredDocIDs));