package org.apache.lucene.facet.search.sampling;

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;

import org.apache.lucene.facet.search.DrillDown;
import org.apache.lucene.facet.search.ScoredDocIDs;
import org.apache.lucene.facet.search.ScoredDocIDsIterator;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Fix sampling results by counting the intersection between two lists: a
 * TermDocs (list of documents in a certain category) and a DocIdSetIterator
 * (list of documents matching the query).
 * 
 * 
 * @lucene.experimental
 */
// TODO (Facet): implement also an estimated fixing by ratio (taking into
// account "translation" of counts!)
class TakmiSampleFixer implements SampleFixer {
  
  private final TaxonomyReader taxonomyReader;
  private final IndexReader indexReader;
  private final FacetSearchParams searchParams;
  
  public TakmiSampleFixer(IndexReader indexReader,
      TaxonomyReader taxonomyReader, FacetSearchParams searchParams) {
    this.indexReader = indexReader;
    this.taxonomyReader = taxonomyReader;
    this.searchParams = searchParams;
  }

  public void fixResult(ScoredDocIDs origDocIds, FacetResult fres)
      throws IOException {
    FacetResultNode topRes = fres.getFacetResultNode();
    fixResultNode(topRes, origDocIds);
  }
  
  /**
   * Fix result node count, and, recursively, fix all its children
   * 
   * @param facetResNode
   *          result node to be fixed
   * @param docIds
   *          docids in effect
   * @throws IOException
   */
  private void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds)
      throws IOException {
    recount(facetResNode, docIds);
    for (FacetResultNode frn : facetResNode.getSubResults()) {
      fixResultNode(frn, docIds);
    }
  }

  /**
   * Internal utility: recount for a facet result node
   * 
   * @param fresNode
   *          result node to be recounted
   * @param docIds
   *          full set of matching documents.
   * @throws IOException
   */
  private void recount(FacetResultNode fresNode, ScoredDocIDs docIds)
      throws IOException {
    // TODO (Facet): change from void to return the new, smaller docSet, and use
    // that for the children, as this will make their intersection ops faster.
    // can do this only when the new set is "sufficiently" smaller.
    
    /* We need the category's path name in order to do its recounting.
     * If it is missing, because the option to label only part of the
     * facet results was exercise, we need to calculate them anyway, so
     * in essence sampling with recounting spends some extra cycles for
     * labeling results for which labels are not required. */
    CategoryPath catPath = fresNode.getLabel(taxonomyReader); // force labeling
    
    Term drillDownTerm = DrillDown.term(searchParams, catPath);
    // Obtain the posting list for the category's drill-down term and count its
    // intersection with the matching doc ids. TermDocs holds index resources,
    // so it must be closed once the counting is done (the previous version
    // leaked it).
    TermDocs termDocs = indexReader.termDocs(drillDownTerm);
    int updatedCount;
    try {
      updatedCount = countIntersection(termDocs, docIds.iterator());
    } finally {
      termDocs.close();
    }
    
    fresNode.setValue(updatedCount);
  }

  /**
   * Count the size of the intersection between two lists: a TermDocs (list of
   * documents in a certain category) and a DocIdSetIterator (list of documents
   * matching a query).
   * <p>
   * Note: the caller retains ownership of {@code p1} and is responsible for
   * closing it.
   */
  private static int countIntersection(TermDocs p1, ScoredDocIDsIterator p2)
      throws IOException {
    // The documentation of of both TermDocs and DocIdSetIterator claim 
    // that we must do next() before doc(). So we do, and if one of the 
    // lists is empty, obviously return 0;
    if (!p1.next()) {
      return 0;
    }
    if (!p2.next()) {
      return 0;
    }
    
    int d1 = p1.doc();
    int d2 = p2.getDocID();
    
    int count = 0;
    for (;;) {
      if (d1 == d2) {
        ++count;
        if (!p1.next()) {
          break; // end of list 1, nothing more in intersection
        }
        d1 = p1.doc();
        if (!advance(p2, d1)) {
          break; // end of list 2, nothing more in intersection
        }
        d2 = p2.getDocID();
      } else if (d1 < d2) {
        if (!p1.skipTo(d2)) {
          break; // end of list 1, nothing more in intersection
        }
        d1 = p1.doc();
      } else /* d1>d2 */ {
        if (!advance(p2, d1)) {
          break; // end of list 2, nothing more in intersection
        }
        d2 = p2.getDocID();
      }
    }
    return count;
  }

  /**
   * utility: advance the iterator until finding (or exceeding) specific
   * document
   * 
   * @param iterator
   *          iterator being advanced
   * @param targetDoc
   *          target of advancing
   * @return false if iterator exhausted, true otherwise.
   */
  private static boolean advance(ScoredDocIDsIterator iterator, int targetDoc) {
    while (iterator.next()) {
      if (iterator.getDocID() >= targetDoc) {
        return true; // target reached
      }
    }
    return false; // exhausted
  }
}