lucene-java-3.4.0/lucene/contrib/facet/src/java/org/apache/lucene/facet/util/ScoredDocIdsUtils.java

   1 package org.apache.lucene.facet.util;
   2
   3 import java.io.IOException;
   4 import java.util.Arrays;
   5
   6 import org.apache.lucene.index.IndexReader;
   7 import org.apache.lucene.search.DocIdSet;
   8 import org.apache.lucene.search.DocIdSetIterator;
   9 import org.apache.lucene.util.OpenBitSet;
  10 import org.apache.lucene.util.OpenBitSetDISI;
  11
  12 import org.apache.lucene.facet.search.ScoredDocIDs;
  13 import org.apache.lucene.facet.search.ScoredDocIDsIterator;
  14
  15 /**
  16  * Licensed to the Apache Software Foundation (ASF) under one or more
  17  * contributor license agreements.  See the NOTICE file distributed with
  18  * this work for additional information regarding copyright ownership.
  19  * The ASF licenses this file to You under the Apache License, Version 2.0
  20  * (the "License"); you may not use this file except in compliance with
  21  * the License.  You may obtain a copy of the License at
  22  *
  23  *     http://www.apache.org/licenses/LICENSE-2.0
  24  *
  25  * Unless required by applicable law or agreed to in writing, software
  26  * distributed under the License is distributed on an "AS IS" BASIS,
  27  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  28  * See the License for the specific language governing permissions and
  29  * limitations under the License.
  30  */
  31
  32 /**
  33  * Utility methods for Scored Doc IDs.
  34  *
  35  * @lucene.experimental
  36  */
  37 public class ScoredDocIdsUtils {
  38
  39   /**
  40    * Create a complement of the input set. The returned {@link ScoredDocIDs}
  41    * does not contain any scores, which makes sense given that the complementing
  42    * documents were not scored.
  43    *
  44    * Note: the complement set does NOT contain doc ids which are noted as deleted by the given reader
  45    *
  46    * @param docids to be complemented.
  47    * @param reader holding the number of documents & information about deletions.
  48    */
  49   public final static ScoredDocIDs getComplementSet(final ScoredDocIDs docids, final IndexReader reader)
  50   throws IOException {
  51     final int maxDoc = reader.maxDoc();
  52
  53     DocIdSet docIdSet = docids.getDocIDs();
  54     final OpenBitSet complement;
  55     if (docIdSet instanceof OpenBitSet) {
  56       // That is the most common case, if ScoredDocIdsCollector was used.
  57       complement = (OpenBitSet) ((OpenBitSet) docIdSet).clone();
  58     } else {
  59       complement = new OpenBitSetDISI(docIdSet.iterator(), maxDoc);
  60     }
  61
  62     complement.flip(0, maxDoc);
  63
  64     // Remove all Deletions from the complement set
  65     clearDeleted(reader, complement);
  66
  67     return createScoredDocIds(complement, maxDoc);
  68   }
  69
  70   /**
  71    * Clear all deleted documents from a given open-bit-set according to a given reader
  72    */
  73   private static void clearDeleted(final IndexReader reader,
  74       final OpenBitSet set) throws IOException {
  75
  76     // If there are no deleted docs
  77     if (!reader.hasDeletions()) {
  78       return; // return immediately
  79     }
  80
  81     DocIdSetIterator it = set.iterator();
  82     int doc = DocIdSetIterator.NO_MORE_DOCS;
  83     while ((doc = it.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
  84       if (reader.isDeleted(doc)) {
  85         set.fastClear(doc);
  86       }
  87     }
  88   }
  89
  90   /**
  91    * Create a subset of an existing ScoredDocIDs object.
  92    *
  93    * @param allDocIds orginal set
  94    * @param sampleSet Doc Ids of the subset.
  95    */
  96   public static final ScoredDocIDs createScoredDocIDsSubset(final ScoredDocIDs allDocIds,
  97       final int[] sampleSet) throws IOException {
  98
  99     // sort so that we can scan docs in order
 100     final int[] docids = sampleSet;
 101     Arrays.sort(docids);
 102     final float[] scores = new float[docids.length];
 103     // fetch scores and compute size
 104     ScoredDocIDsIterator it = allDocIds.iterator();
 105     int n = 0;
 106     while (it.next() && n < docids.length) {
 107       int doc = it.getDocID();
 108       if (doc == docids[n]) {
 109         scores[n] = it.getScore();
 110         ++n;
 111       }
 112     }
 113     final int size = n;
 114
 115     return new ScoredDocIDs() {
 116
 117       public DocIdSet getDocIDs() {
 118         return new DocIdSet() {
 119
 120           @Override
 121           public boolean isCacheable() { return true; }
 122
 123           @Override
 124           public DocIdSetIterator iterator() throws IOException {
 125             return new DocIdSetIterator() {
 126
 127               private int next = -1;
 128
 129               @Override
 130               public int advance(int target) throws IOException {
 131                 while (next < size && docids[next++] < target) {
 132                 }
 133                 return next == size ? NO_MORE_DOCS : docids[next];
 134               }
 135
 136               @Override
 137               public int docID() {
 138                 return docids[next];
 139               }
 140
 141               @Override
 142               public int nextDoc() throws IOException {
 143                 if (++next >= size) {
 144                   return NO_MORE_DOCS;
 145                 }
 146                 return docids[next];
 147               }
 148
 149             };
 150           }
 151         };
 152       }
 153
 154       public ScoredDocIDsIterator iterator() throws IOException {
 155         return new ScoredDocIDsIterator() {
 156
 157           int next = -1;
 158
 159           public boolean next() { return ++next < size; }
 160
 161           public float getScore() { return scores[next]; }
 162
 163           public int getDocID() { return docids[next]; }
 164         };
 165       }
 166
 167       public int size() { return size; }
 168
 169     };
 170   }
 171
 172   /**
 173    * Creates a {@link ScoredDocIDs} which returns document IDs all non-deleted doc ids
 174    * according to the given reader.
 175    * The returned set contains the range of [0 .. reader.maxDoc ) doc ids
 176    */
 177   public static final ScoredDocIDs createAllDocsScoredDocIDs (final IndexReader reader) {
 178     if (reader.hasDeletions()) {
 179       return new AllLiveDocsScoredDocIDs(reader);
 180     }
 181     return new AllDocsScoredDocIDs(reader);
 182   }
 183
 184   /**
 185    * Create a ScoredDocIDs out of a given docIdSet and the total number of documents in an index
 186    */
 187   public static final ScoredDocIDs createScoredDocIds(final DocIdSet docIdSet, final int maxDoc) {
 188     return new ScoredDocIDs() {
 189       private int size = -1;
 190       public DocIdSet getDocIDs() { return docIdSet; }
 191
 192       public ScoredDocIDsIterator iterator() throws IOException {
 193         final DocIdSetIterator docIterator = docIdSet.iterator();
 194         return new ScoredDocIDsIterator() {
 195           public boolean next() {
 196             try {
 197               return docIterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS;
 198             } catch (IOException e) {
 199               throw new RuntimeException(e);
 200             }
 201           }
 202
 203           public float getScore() { return DEFAULT_SCORE; }
 204
 205           public int getDocID() { return docIterator.docID(); }
 206         };
 207       }
 208
 209       public int size() {
 210         // lazy size computation
 211         if (size < 0) {
 212           OpenBitSetDISI openBitSetDISI;
 213           try {
 214             openBitSetDISI = new OpenBitSetDISI(docIdSet.iterator(), maxDoc);
 215           } catch (IOException e) {
 216             throw new RuntimeException(e);
 217           }
 218           size = (int) openBitSetDISI.cardinality();
 219         }
 220         return size;
 221       }
 222     };
 223   }
 224
 225   /**
 226    * All docs ScoredDocsIDs - this one is simply an 'all 1' bitset. Used when
 227    * there are no deletions in the index and we wish to go through each and
 228    * every document
 229    */
 230   private static class AllDocsScoredDocIDs implements ScoredDocIDs {
 231     final int maxDoc;
 232
 233     public AllDocsScoredDocIDs(IndexReader reader) {
 234       this.maxDoc = reader.maxDoc();
 235     }
 236
 237     public int size() {
 238       return maxDoc;
 239     }
 240
 241     public DocIdSet getDocIDs() {
 242       return new DocIdSet() {
 243
 244         @Override
 245         public boolean isCacheable() {
 246           return true;
 247         }
 248
 249         @Override
 250         public DocIdSetIterator iterator() throws IOException {
 251           return new DocIdSetIterator() {
 252             private int next = -1;
 253
 254             @Override
 255             public int advance(int target) throws IOException {
 256               if (target <= next) {
 257                 target = next + 1;
 258               }
 259               return next = target >= maxDoc ? NO_MORE_DOCS
 260                   : target;
 261             }
 262
 263             @Override
 264             public int docID() {
 265               return next;
 266             }
 267
 268             @Override
 269             public int nextDoc() throws IOException {
 270               return ++next < maxDoc ? next : NO_MORE_DOCS;
 271             }
 272
 273           };
 274         }
 275       };
 276     }
 277
 278     public ScoredDocIDsIterator iterator() {
 279       try {
 280         final DocIdSetIterator iter = getDocIDs().iterator();
 281         return new ScoredDocIDsIterator() {
 282           public boolean next() {
 283             try {
 284               return iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS;
 285             } catch (IOException e) {
 286               // cannot happen
 287               return false;
 288             }
 289           }
 290
 291           public float getScore() {
 292             return DEFAULT_SCORE;
 293           }
 294
 295           public int getDocID() {
 296             return iter.docID();
 297           }
 298         };
 299       } catch (IOException e) {
 300         // cannot happen
 301         throw new RuntimeException(e);
 302       }
 303     }
 304   }
 305
 306   /**
 307    * An All-docs bitset which has '0' for deleted documents and '1' for the
 308    * rest. Useful for iterating over all 'live' documents in a given index.
 309    * <p>
 310    * NOTE: this class would work for indexes with no deletions at all,
 311    * although it is recommended to use {@link AllDocsScoredDocIDs} to ease
 312    * the performance cost of validating isDeleted() on each and every docId
 313    */
 314   private static final class AllLiveDocsScoredDocIDs implements ScoredDocIDs {
 315     final int maxDoc;
 316     final IndexReader reader;
 317
 318     AllLiveDocsScoredDocIDs(IndexReader reader) {
 319       this.maxDoc = reader.maxDoc();
 320       this.reader = reader;
 321     }
 322
 323     public int size() {
 324       return reader.numDocs();
 325     }
 326
 327     public DocIdSet getDocIDs() {
 328       return new DocIdSet() {
 329
 330         @Override
 331         public boolean isCacheable() {
 332           return true;
 333         }
 334
 335         @Override
 336         public DocIdSetIterator iterator() throws IOException {
 337           return new DocIdSetIterator() {
 338             private int next = -1;
 339
 340             @Override
 341             public int advance(int target) throws IOException {
 342               if (target > next) {
 343                 next = target - 1;
 344               }
 345               return nextDoc();
 346             }
 347
 348             @Override
 349             public int docID() {
 350               return next;
 351             }
 352
 353             @Override
 354             public int nextDoc() throws IOException {
 355               do {
 356                 ++next;
 357               } while (next < maxDoc && reader.isDeleted(next));
 358
 359               return next < maxDoc ? next : NO_MORE_DOCS;
 360             }
 361
 362           };
 363         }
 364       };
 365     }
 366
 367     public ScoredDocIDsIterator iterator() {
 368       try {
 369         final DocIdSetIterator iter = getDocIDs().iterator();
 370         return new ScoredDocIDsIterator() {
 371           public boolean next() {
 372             try {
 373               return iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS;
 374             } catch (IOException e) {
 375               // cannot happen
 376               return false;
 377             }
 378           }
 379
 380           public float getScore() {
 381             return DEFAULT_SCORE;
 382           }
 383
 384           public int getDocID() {
 385             return iter.docID();
 386           }
 387         };
 388       } catch (IOException e) {
 389         // cannot happen
 390         throw new RuntimeException(e);
 391       }
 392     }
 393   }
 394 }