lucene-java-3.4.0/lucene/contrib/facet/src/java/org/apache/lucene/facet/search/sampling/TakmiSampleFixer.java

   1 package org.apache.lucene.facet.search.sampling;
   2
   3 import java.io.IOException;
   4
   5 import org.apache.lucene.index.IndexReader;
   6 import org.apache.lucene.index.Term;
   7 import org.apache.lucene.index.TermDocs;
   8
   9 import org.apache.lucene.facet.search.DrillDown;
  10 import org.apache.lucene.facet.search.ScoredDocIDs;
  11 import org.apache.lucene.facet.search.ScoredDocIDsIterator;
  12 import org.apache.lucene.facet.search.params.FacetSearchParams;
  13 import org.apache.lucene.facet.search.results.FacetResult;
  14 import org.apache.lucene.facet.search.results.FacetResultNode;
  15 import org.apache.lucene.facet.taxonomy.CategoryPath;
  16 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
  17
  18 /**
  19  * Licensed to the Apache Software Foundation (ASF) under one or more
  20  * contributor license agreements.  See the NOTICE file distributed with
  21  * this work for additional information regarding copyright ownership.
  22  * The ASF licenses this file to You under the Apache License, Version 2.0
  23  * (the "License"); you may not use this file except in compliance with
  24  * the License.  You may obtain a copy of the License at
  25  *
  26  *     http://www.apache.org/licenses/LICENSE-2.0
  27  *
  28  * Unless required by applicable law or agreed to in writing, software
  29  * distributed under the License is distributed on an "AS IS" BASIS,
  30  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  31  * See the License for the specific language governing permissions and
  32  * limitations under the License.
  33  */
  34
  35 /**
  36  * Fix sampling results by counting the intersection between two lists: a
  37  * TermDocs (list of documents in a certain category) and a DocIdSetIterator
  38  * (list of documents matching the query).
  39  *
  40  *
  41  * @lucene.experimental
  42  */
  43 // TODO (Facet): implement also an estimated fixing by ratio (taking into
  44 // account "translation" of counts!)
  45 class TakmiSampleFixer implements SampleFixer {
  46
  47   private TaxonomyReader taxonomyReader;
  48   private IndexReader indexReader;
  49   private FacetSearchParams searchParams;
  50
  51   public TakmiSampleFixer(IndexReader indexReader,
  52       TaxonomyReader taxonomyReader, FacetSearchParams searchParams) {
  53     this.indexReader = indexReader;
  54     this.taxonomyReader = taxonomyReader;
  55     this.searchParams = searchParams;
  56   }
  57
  58   public void fixResult(ScoredDocIDs origDocIds, FacetResult fres)
  59       throws IOException {
  60     FacetResultNode topRes = fres.getFacetResultNode();
  61     fixResultNode(topRes, origDocIds);
  62   }
  63
  64   /**
  65    * Fix result node count, and, recursively, fix all its children
  66    *
  67    * @param facetResNode
  68    *          result node to be fixed
  69    * @param docIds
  70    *          docids in effect
  71    * @throws IOException
  72    */
  73   private void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds)
  74       throws IOException {
  75     recount(facetResNode, docIds);
  76     for (FacetResultNode frn : facetResNode.getSubResults()) {
  77       fixResultNode(frn, docIds);
  78     }
  79   }
  80
  81   /**
  82    * Internal utility: recount for a facet result node
  83    *
  84    * @param fresNode
  85    *          result node to be recounted
  86    * @param docIds
  87    *          full set of matching documents.
  88    * @throws IOException
  89    */
  90   private void recount(FacetResultNode fresNode, ScoredDocIDs docIds)
  91       throws IOException {
  92     // TODO (Facet): change from void to return the new, smaller docSet, and use
  93     // that for the children, as this will make their intersection ops faster.
  94     // can do this only when the new set is "sufficiently" smaller.
  95
  96     /* We need the category's path name in order to do its recounting.
  97      * If it is missing, because the option to label only part of the
  98      * facet results was exercise, we need to calculate them anyway, so
  99      * in essence sampling with recounting spends some extra cycles for
 100      * labeling results for which labels are not required. */
 101     CategoryPath catPath = fresNode.getLabel(taxonomyReader); // force labeling
 102
 103     Term drillDownTerm = DrillDown.term(searchParams, catPath);
 104     int updatedCount = countIntersection(indexReader.termDocs(drillDownTerm),
 105         docIds.iterator());
 106
 107     fresNode.setValue(updatedCount);
 108   }
 109
 110   /**
 111    * Count the size of the intersection between two lists: a TermDocs (list of
 112    * documents in a certain category) and a DocIdSetIterator (list of documents
 113    * matching a query).
 114    */
 115   private static int countIntersection(TermDocs p1, ScoredDocIDsIterator p2)
 116       throws IOException {
 117     // The documentation of of both TermDocs and DocIdSetIterator claim
 118     // that we must do next() before doc(). So we do, and if one of the
 119     // lists is empty, obviously return 0;
 120     if (!p1.next()) {
 121       return 0;
 122     }
 123     if (!p2.next()) {
 124       return 0;
 125     }
 126
 127     int d1 = p1.doc();
 128     int d2 = p2.getDocID();
 129
 130     int count = 0;
 131     for (;;) {
 132       if (d1 == d2) {
 133         ++count;
 134         if (!p1.next()) {
 135           break; // end of list 1, nothing more in intersection
 136         }
 137         d1 = p1.doc();
 138         if (!advance(p2, d1)) {
 139           break; // end of list 2, nothing more in intersection
 140         }
 141         d2 = p2.getDocID();
 142       } else if (d1 < d2) {
 143         if (!p1.skipTo(d2)) {
 144           break; // end of list 1, nothing more in intersection
 145         }
 146         d1 = p1.doc();
 147       } else /* d1>d2 */ {
 148         if (!advance(p2, d1)) {
 149           break; // end of list 2, nothing more in intersection
 150         }
 151         d2 = p2.getDocID();
 152       }
 153     }
 154     return count;
 155   }
 156
 157   /**
 158    * utility: advance the iterator until finding (or exceeding) specific
 159    * document
 160    *
 161    * @param iterator
 162    *          iterator being advanced
 163    * @param targetDoc
 164    *          target of advancing
 165    * @return false if iterator exhausted, true otherwise.
 166    */
 167   private static boolean advance(ScoredDocIDsIterator iterator, int targetDoc) {
 168     while (iterator.next()) {
 169       if (iterator.getDocID() >= targetDoc) {
 170         return true; // target reached
 171       }
 172     }
 173     return false; // exhausted
 174   }
 175 }