+++ /dev/null
-package org.apache.lucene.facet.search.sampling;
-
-import java.io.IOException;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermDocs;
-
-import org.apache.lucene.facet.search.DrillDown;
-import org.apache.lucene.facet.search.ScoredDocIDs;
-import org.apache.lucene.facet.search.ScoredDocIDsIterator;
-import org.apache.lucene.facet.search.params.FacetSearchParams;
-import org.apache.lucene.facet.search.results.FacetResult;
-import org.apache.lucene.facet.search.results.FacetResultNode;
-import org.apache.lucene.facet.taxonomy.CategoryPath;
-import org.apache.lucene.facet.taxonomy.TaxonomyReader;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Fix sampling results by counting the intersection between two lists: a
- * TermDocs (list of documents in a certain category) and a DocIdSetIterator
- * (list of documents matching the query).
- *
- *
- * @lucene.experimental
- */
-// TODO (Facet): implement also an estimated fixing by ratio (taking into
-// account "translation" of counts!)
-class TakmiSampleFixer implements SampleFixer {
-
- private TaxonomyReader taxonomyReader;
- private IndexReader indexReader;
- private FacetSearchParams searchParams;
-
- public TakmiSampleFixer(IndexReader indexReader,
- TaxonomyReader taxonomyReader, FacetSearchParams searchParams) {
- this.indexReader = indexReader;
- this.taxonomyReader = taxonomyReader;
- this.searchParams = searchParams;
- }
-
- public void fixResult(ScoredDocIDs origDocIds, FacetResult fres)
- throws IOException {
- FacetResultNode topRes = fres.getFacetResultNode();
- fixResultNode(topRes, origDocIds);
- }
-
- /**
- * Fix result node count, and, recursively, fix all its children
- *
- * @param facetResNode
- * result node to be fixed
- * @param docIds
- * docids in effect
- * @throws IOException
- */
- private void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds)
- throws IOException {
- recount(facetResNode, docIds);
- for (FacetResultNode frn : facetResNode.getSubResults()) {
- fixResultNode(frn, docIds);
- }
- }
-
- /**
- * Internal utility: recount for a facet result node
- *
- * @param fresNode
- * result node to be recounted
- * @param docIds
- * full set of matching documents.
- * @throws IOException
- */
- private void recount(FacetResultNode fresNode, ScoredDocIDs docIds)
- throws IOException {
- // TODO (Facet): change from void to return the new, smaller docSet, and use
- // that for the children, as this will make their intersection ops faster.
- // can do this only when the new set is "sufficiently" smaller.
-
- /* We need the category's path name in order to do its recounting.
- * If it is missing, because the option to label only part of the
- * facet results was exercise, we need to calculate them anyway, so
- * in essence sampling with recounting spends some extra cycles for
- * labeling results for which labels are not required. */
- CategoryPath catPath = fresNode.getLabel(taxonomyReader); // force labeling
-
- Term drillDownTerm = DrillDown.term(searchParams, catPath);
- int updatedCount = countIntersection(indexReader.termDocs(drillDownTerm),
- docIds.iterator());
-
- fresNode.setValue(updatedCount);
- }
-
- /**
- * Count the size of the intersection between two lists: a TermDocs (list of
- * documents in a certain category) and a DocIdSetIterator (list of documents
- * matching a query).
- */
- private static int countIntersection(TermDocs p1, ScoredDocIDsIterator p2)
- throws IOException {
- // The documentation of of both TermDocs and DocIdSetIterator claim
- // that we must do next() before doc(). So we do, and if one of the
- // lists is empty, obviously return 0;
- if (!p1.next()) {
- return 0;
- }
- if (!p2.next()) {
- return 0;
- }
-
- int d1 = p1.doc();
- int d2 = p2.getDocID();
-
- int count = 0;
- for (;;) {
- if (d1 == d2) {
- ++count;
- if (!p1.next()) {
- break; // end of list 1, nothing more in intersection
- }
- d1 = p1.doc();
- if (!advance(p2, d1)) {
- break; // end of list 2, nothing more in intersection
- }
- d2 = p2.getDocID();
- } else if (d1 < d2) {
- if (!p1.skipTo(d2)) {
- break; // end of list 1, nothing more in intersection
- }
- d1 = p1.doc();
- } else /* d1>d2 */ {
- if (!advance(p2, d1)) {
- break; // end of list 2, nothing more in intersection
- }
- d2 = p2.getDocID();
- }
- }
- return count;
- }
-
- /**
- * utility: advance the iterator until finding (or exceeding) specific
- * document
- *
- * @param iterator
- * iterator being advanced
- * @param targetDoc
- * target of advancing
- * @return false if iterator exhausted, true otherwise.
- */
- private static boolean advance(ScoredDocIDsIterator iterator, int targetDoc) {
- while (iterator.next()) {
- if (iterator.getDocID() >= targetDoc) {
- return true; // target reached
- }
- }
- return false; // exhausted
- }
-}
\ No newline at end of file