lucene-java-3.4.0/lucene/src/java/org/apache/lucene/search/CachingCollector.java

   1 package org.apache.lucene.search;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.util.ArrayList;
  22 import java.util.List;
  23
  24 import org.apache.lucene.index.IndexReader;
  25 import org.apache.lucene.util.RamUsageEstimator;
  26
  27 /**
  28  * Caches all docs, and optionally also scores, coming from
  29  * a search, and is then able to replay them to another
  30  * collector.  You specify the max RAM this class may use.
  31  * Once the collection is done, call {@link #isCached}.  If
  32  * this returns true, you can use {@link #replay} against a
  33  * new collector.  If it returns false, this means too much
  34  * RAM was required and you must instead re-run the original
  35  * search.
  36  *
  37  * <p><b>NOTE</b>: this class consumes 4 (or 8 bytes, if
  38  * scoring is cached) per collected document.  If the result
  39  * set is large this can easily be a very substantial amount
  40  * of RAM!
  41  *
  42  * <p><b>NOTE</b>: this class caches at least 128 documents
  43  * before checking RAM limits.
  44  *
  45  * <p>See the Lucene <tt>contrib/grouping</tt> module for more
  46  * details including a full code example.</p>
  47  *
  48  * @lucene.experimental
  49  */
  50 public abstract class CachingCollector extends Collector {
  51
  52   // Max out at 512K arrays
  53   private static final int MAX_ARRAY_SIZE = 512 * 1024;
  54   private static final int INITIAL_ARRAY_SIZE = 128;
  55   private final static int[] EMPTY_INT_ARRAY = new int[0];
  56
  57   private static class SegStart {
  58     public final IndexReader reader;
  59     public final int base;
  60     public final int end;
  61
  62     public SegStart(IndexReader reader, int base, int end) {
  63       this.reader = reader;
  64       this.base = base;
  65       this.end = end;
  66     }
  67   }
  68
  69   private static final class CachedScorer extends Scorer {
  70
  71     // NOTE: these members are package-private b/c that way accessing them from
  72     // the outer class does not incur access check by the JVM. The same
  73     // situation would be if they were defined in the outer class as private
  74     // members.
  75     int doc;
  76     float score;
  77
  78     private CachedScorer() { super((Weight) null); }
  79
  80     @Override
  81     public final float score() { return score; }
  82
  83     @Override
  84     public final int advance(int target) { throw new UnsupportedOperationException(); }
  85
  86     @Override
  87     public final int docID() { return doc; }
  88
  89     @Override
  90     public final float freq() { throw new UnsupportedOperationException(); }
  91
  92     @Override
  93     public final int nextDoc() { throw new UnsupportedOperationException(); }
  94   }
  95
  96   // A CachingCollector which caches scores
  97   private static final class ScoreCachingCollector extends CachingCollector {
  98
  99     private final CachedScorer cachedScorer;
 100     private final List<float[]> cachedScores;
 101
 102     private Scorer scorer;
 103     private float[] curScores;
 104
 105     ScoreCachingCollector(Collector other, double maxRAMMB) {
 106       super(other, maxRAMMB, true);
 107
 108       cachedScorer = new CachedScorer();
 109       cachedScores = new ArrayList<float[]>();
 110       curScores = new float[128];
 111       cachedScores.add(curScores);
 112     }
 113
 114     ScoreCachingCollector(Collector other, int maxDocsToCache) {
 115       super(other, maxDocsToCache);
 116
 117       cachedScorer = new CachedScorer();
 118       cachedScores = new ArrayList<float[]>();
 119       curScores = new float[INITIAL_ARRAY_SIZE];
 120       cachedScores.add(curScores);
 121     }
 122
 123     @Override
 124     public void collect(int doc) throws IOException {
 125
 126       if (curDocs == null) {
 127         // Cache was too large
 128         cachedScorer.score = scorer.score();
 129         cachedScorer.doc = doc;
 130         other.collect(doc);
 131         return;
 132       }
 133
 134       // Allocate a bigger array or abort caching
 135       if (upto == curDocs.length) {
 136         base += upto;
 137
 138         // Compute next array length - don't allocate too big arrays
 139         int nextLength = 8*curDocs.length;
 140         if (nextLength > MAX_ARRAY_SIZE) {
 141           nextLength = MAX_ARRAY_SIZE;
 142         }
 143
 144         if (base + nextLength > maxDocsToCache) {
 145           // try to allocate a smaller array
 146           nextLength = maxDocsToCache - base;
 147           if (nextLength <= 0) {
 148             // Too many docs to collect -- clear cache
 149             curDocs = null;
 150             curScores = null;
 151             cachedSegs.clear();
 152             cachedDocs.clear();
 153             cachedScores.clear();
 154             cachedScorer.score = scorer.score();
 155             cachedScorer.doc = doc;
 156             other.collect(doc);
 157             return;
 158           }
 159         }
 160
 161         curDocs = new int[nextLength];
 162         cachedDocs.add(curDocs);
 163         curScores = new float[nextLength];
 164         cachedScores.add(curScores);
 165         upto = 0;
 166       }
 167
 168       curDocs[upto] = doc;
 169       cachedScorer.score = curScores[upto] = scorer.score();
 170       upto++;
 171       cachedScorer.doc = doc;
 172       other.collect(doc);
 173     }
 174
 175     @Override
 176     public void replay(Collector other) throws IOException {
 177       replayInit(other);
 178
 179       int curUpto = 0;
 180       int curBase = 0;
 181       int chunkUpto = 0;
 182       curDocs = EMPTY_INT_ARRAY;
 183       for (SegStart seg : cachedSegs) {
 184         other.setNextReader(seg.reader, seg.base);
 185         other.setScorer(cachedScorer);
 186         while (curBase + curUpto < seg.end) {
 187           if (curUpto == curDocs.length) {
 188             curBase += curDocs.length;
 189             curDocs = cachedDocs.get(chunkUpto);
 190             curScores = cachedScores.get(chunkUpto);
 191             chunkUpto++;
 192             curUpto = 0;
 193           }
 194           cachedScorer.score = curScores[curUpto];
 195           cachedScorer.doc = curDocs[curUpto];
 196           other.collect(curDocs[curUpto++]);
 197         }
 198       }
 199     }
 200
 201     @Override
 202     public void setScorer(Scorer scorer) throws IOException {
 203       this.scorer = scorer;
 204       other.setScorer(cachedScorer);
 205     }
 206
 207     @Override
 208     public String toString() {
 209       if (isCached()) {
 210         return "CachingCollector (" + (base+upto) + " docs & scores cached)";
 211       } else {
 212         return "CachingCollector (cache was cleared)";
 213       }
 214     }
 215
 216   }
 217
 218   // A CachingCollector which does not cache scores
 219   private static final class NoScoreCachingCollector extends CachingCollector {
 220
 221     NoScoreCachingCollector(Collector other, double maxRAMMB) {
 222      super(other, maxRAMMB, false);
 223     }
 224
 225     NoScoreCachingCollector(Collector other, int maxDocsToCache) {
 226      super(other, maxDocsToCache);
 227     }
 228
 229     @Override
 230     public void collect(int doc) throws IOException {
 231
 232       if (curDocs == null) {
 233         // Cache was too large
 234         other.collect(doc);
 235         return;
 236       }
 237
 238       // Allocate a bigger array or abort caching
 239       if (upto == curDocs.length) {
 240         base += upto;
 241
 242         // Compute next array length - don't allocate too big arrays
 243         int nextLength = 8*curDocs.length;
 244         if (nextLength > MAX_ARRAY_SIZE) {
 245           nextLength = MAX_ARRAY_SIZE;
 246         }
 247
 248         if (base + nextLength > maxDocsToCache) {
 249           // try to allocate a smaller array
 250           nextLength = maxDocsToCache - base;
 251           if (nextLength <= 0) {
 252             // Too many docs to collect -- clear cache
 253             curDocs = null;
 254             cachedSegs.clear();
 255             cachedDocs.clear();
 256             other.collect(doc);
 257             return;
 258           }
 259         }
 260
 261         curDocs = new int[nextLength];
 262         cachedDocs.add(curDocs);
 263         upto = 0;
 264       }
 265
 266       curDocs[upto] = doc;
 267       upto++;
 268       other.collect(doc);
 269     }
 270
 271     @Override
 272     public void replay(Collector other) throws IOException {
 273       replayInit(other);
 274
 275       int curUpto = 0;
 276       int curbase = 0;
 277       int chunkUpto = 0;
 278       curDocs = EMPTY_INT_ARRAY;
 279       for (SegStart seg : cachedSegs) {
 280         other.setNextReader(seg.reader, seg.base);
 281         while (curbase + curUpto < seg.end) {
 282           if (curUpto == curDocs.length) {
 283             curbase += curDocs.length;
 284             curDocs = cachedDocs.get(chunkUpto);
 285             chunkUpto++;
 286             curUpto = 0;
 287           }
 288           other.collect(curDocs[curUpto++]);
 289         }
 290       }
 291     }
 292
 293     @Override
 294     public void setScorer(Scorer scorer) throws IOException {
 295       other.setScorer(scorer);
 296     }
 297
 298     @Override
 299     public String toString() {
 300       if (isCached()) {
 301         return "CachingCollector (" + (base+upto) + " docs cached)";
 302       } else {
 303         return "CachingCollector (cache was cleared)";
 304       }
 305     }
 306
 307   }
 308
 309   // TODO: would be nice if a collector defined a
 310   // needsScores() method so we can specialize / do checks
 311   // up front. This is only relevant for the ScoreCaching
 312   // version -- if the wrapped Collector does not need
 313   // scores, it can avoid cachedScorer entirely.
 314   protected final Collector other;
 315
 316   protected final int maxDocsToCache;
 317   protected final List<SegStart> cachedSegs = new ArrayList<SegStart>();
 318   protected final List<int[]> cachedDocs;
 319
 320   private IndexReader lastReader;
 321
 322   protected int[] curDocs;
 323   protected int upto;
 324   protected int base;
 325   protected int lastDocBase;
 326
 327   /**
 328    * Creates a {@link CachingCollector} which does not wrap another collector.
 329    * The cached documents and scores can later be {@link #replay(Collector)
 330    * replayed}.
 331    *
 332    * @param acceptDocsOutOfOrder
 333    *          whether documents are allowed to be collected out-of-order
 334    */
 335   public static CachingCollector create(final boolean acceptDocsOutOfOrder, boolean cacheScores, double maxRAMMB) {
 336     Collector other = new Collector() {
 337       @Override
 338       public boolean acceptsDocsOutOfOrder() {
 339         return acceptDocsOutOfOrder;
 340       }
 341
 342       @Override
 343       public void setScorer(Scorer scorer) throws IOException {}
 344
 345       @Override
 346       public void collect(int doc) throws IOException {}
 347
 348       @Override
 349       public void setNextReader(IndexReader reader, int docBase) throws IOException {}
 350
 351     };
 352     return create(other, cacheScores, maxRAMMB);
 353   }
 354
 355   /**
 356    * Create a new {@link CachingCollector} that wraps the given collector and
 357    * caches documents and scores up to the specified RAM threshold.
 358    *
 359    * @param other
 360    *          the Collector to wrap and delegate calls to.
 361    * @param cacheScores
 362    *          whether to cache scores in addition to document IDs. Note that
 363    *          this increases the RAM consumed per doc
 364    * @param maxRAMMB
 365    *          the maximum RAM in MB to consume for caching the documents and
 366    *          scores. If the collector exceeds the threshold, no documents and
 367    *          scores are cached.
 368    */
 369   public static CachingCollector create(Collector other, boolean cacheScores, double maxRAMMB) {
 370     return cacheScores ? new ScoreCachingCollector(other, maxRAMMB) : new NoScoreCachingCollector(other, maxRAMMB);
 371   }
 372
 373   /**
 374    * Create a new {@link CachingCollector} that wraps the given collector and
 375    * caches documents and scores up to the specified max docs threshold.
 376    *
 377    * @param other
 378    *          the Collector to wrap and delegate calls to.
 379    * @param cacheScores
 380    *          whether to cache scores in addition to document IDs. Note that
 381    *          this increases the RAM consumed per doc
 382    * @param maxDocsToCache
 383    *          the maximum number of documents for caching the documents and
 384    *          possible the scores. If the collector exceeds the threshold,
 385    *          no documents and scores are cached.
 386    */
 387   public static CachingCollector create(Collector other, boolean cacheScores, int maxDocsToCache) {
 388     return cacheScores ? new ScoreCachingCollector(other, maxDocsToCache) : new NoScoreCachingCollector(other, maxDocsToCache);
 389   }
 390
 391   // Prevent extension from non-internal classes
 392   private CachingCollector(Collector other, double maxRAMMB, boolean cacheScores) {
 393     this.other = other;
 394
 395     cachedDocs = new ArrayList<int[]>();
 396     curDocs = new int[INITIAL_ARRAY_SIZE];
 397     cachedDocs.add(curDocs);
 398
 399     int bytesPerDoc = RamUsageEstimator.NUM_BYTES_INT;
 400     if (cacheScores) {
 401       bytesPerDoc += RamUsageEstimator.NUM_BYTES_FLOAT;
 402     }
 403     maxDocsToCache = (int) ((maxRAMMB * 1024 * 1024) / bytesPerDoc);
 404   }
 405
 406   private CachingCollector(Collector other, int maxDocsToCache) {
 407     this.other = other;
 408
 409     cachedDocs = new ArrayList<int[]>();
 410     curDocs = new int[INITIAL_ARRAY_SIZE];
 411     cachedDocs.add(curDocs);
 412     this.maxDocsToCache = maxDocsToCache;
 413   }
 414
 415   @Override
 416   public boolean acceptsDocsOutOfOrder() {
 417     return other.acceptsDocsOutOfOrder();
 418   }
 419
 420   public boolean isCached() {
 421     return curDocs != null;
 422   }
 423
 424   @Override
 425   public void setNextReader(IndexReader reader, int docBase) throws IOException {
 426     other.setNextReader(reader, docBase);
 427     if (lastReader != null) {
 428       cachedSegs.add(new SegStart(lastReader, lastDocBase, base + upto));
 429     }
 430     lastDocBase = docBase;
 431     lastReader = reader;
 432   }
 433
 434   /** Reused by the specialized inner classes. */
 435   void replayInit(Collector other) {
 436     if (!isCached()) {
 437       throw new IllegalStateException("cannot replay: cache was cleared because too much RAM was required");
 438     }
 439
 440     if (!other.acceptsDocsOutOfOrder() && this.other.acceptsDocsOutOfOrder()) {
 441       throw new IllegalArgumentException(
 442           "cannot replay: given collector does not support "
 443               + "out-of-order collection, while the wrapped collector does. "
 444               + "Therefore cached documents may be out-of-order.");
 445     }
 446
 447     //System.out.println("CC: replay totHits=" + (upto + base));
 448     if (lastReader != null) {
 449       cachedSegs.add(new SegStart(lastReader, lastDocBase, base+upto));
 450       lastReader = null;
 451     }
 452   }
 453
 454   /**
 455    * Replays the cached doc IDs (and scores) to the given Collector. If this
 456    * instance does not cache scores, then Scorer is not set on
 457    * {@code other.setScorer} as well as scores are not replayed.
 458    *
 459    * @throws IllegalStateException
 460    *           if this collector is not cached (i.e., if the RAM limits were too
 461    *           low for the number of documents + scores to cache).
 462    * @throws IllegalArgumentException
 463    *           if the given Collect's does not support out-of-order collection,
 464    *           while the collector passed to the ctor does.
 465    */
 466   public abstract void replay(Collector other) throws IOException;
 467
 468 }