lucene-java-3.4.0/lucene/src/java/org/apache/lucene/search/Collector.java

   1 package org.apache.lucene.search;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21
  22 import org.apache.lucene.index.IndexReader;
  23
  24 /**
  25  * <p>Expert: Collectors are primarily meant to be used to
  26  * gather raw results from a search, and implement sorting
  27  * or custom result filtering, collation, etc. </p>
  28  *
  29  * <p>Lucene's core collectors are derived from Collector.
  30  * Likely your application can use one of these classes, or
  31  * subclass {@link TopDocsCollector}, instead of
  32  * implementing Collector directly:
  33  *
  34  * <ul>
  35  *
  36  *   <li>{@link TopDocsCollector} is an abstract base class
  37  *   that assumes you will retrieve the top N docs,
  38  *   according to some criteria, after collection is
  39  *   done.  </li>
  40  *
  41  *   <li>{@link TopScoreDocCollector} is a concrete subclass
  42  *   of {@link TopDocsCollector} and sorts according to score +
  43  *   docID.  This is used internally by the {@link
  44  *   IndexSearcher} search methods that do not take an
  45  *   explicit {@link Sort}. It is likely the most frequently
  46  *   used collector.</li>
  47  *
  48  *   <li>{@link TopFieldCollector} subclasses {@link
  49  *   TopDocsCollector} and sorts according to a specified
  50  *   {@link Sort} object (sort by field).  This is used
  51  *   internally by the {@link IndexSearcher} search methods
  52  *   that take an explicit {@link Sort}.</li>
  53  *
  54  *   <li>{@link TimeLimitingCollector}, which wraps any other
  55  *   Collector and aborts the search if it's taken too much
  56  *   time.</li>
  57  *
  58  *   <li>{@link PositiveScoresOnlyCollector} wraps any other
  59  *   Collector and prevents collection of hits whose score
  60  *   is &lt;= 0.0</li>
  61  *
  62  * </ul>
  63  *
  64  * <p>Collector decouples the score from the collected doc:
  65  * the score computation is skipped entirely if it's not
  66  * needed.  Collectors that do need the score should
  67  * implement the {@link #setScorer} method, to hold onto the
  68  * passed {@link Scorer} instance, and call {@link
  69  * Scorer#score()} within the collect method to compute the
  70  * current hit's score.  If your collector may request the
  71  * score for a single hit multiple times, you should use
  72  * {@link ScoreCachingWrappingScorer}. </p>
  73  *
  74  * <p><b>NOTE:</b> The doc that is passed to the collect
  75  * method is relative to the current reader. If your
  76  * collector needs to resolve this to the docID space of the
  77  * Multi*Reader, you must re-base it by recording the
  78  * docBase from the most recent setNextReader call.  Here's
  79  * a simple example showing how to collect docIDs into a
  80  * BitSet:</p>
  81  *
  82  * <pre>
  83  * Searcher searcher = new IndexSearcher(indexReader);
  84  * final BitSet bits = new BitSet(indexReader.maxDoc());
  85  * searcher.search(query, new Collector() {
  86  *   private int docBase;
  87  *
  88  *   <em>// ignore scorer</em>
  89  *   public void setScorer(Scorer scorer) {
  90  *   }
  91  *
  92  *   <em>// accept docs out of order (for a BitSet it doesn't matter)</em>
  93  *   public boolean acceptsDocsOutOfOrder() {
  94  *     return true;
  95  *   }
  96  *
  97  *   public void collect(int doc) {
  98  *     bits.set(doc + docBase);
  99  *   }
 100  *
 101  *   public void setNextReader(IndexReader reader, int docBase) {
 102  *     this.docBase = docBase;
 103  *   }
 104  * });
 105  * </pre>
 106  *
 107  * <p>Not all collectors will need to rebase the docID.  For
 108  * example, a collector that simply counts the total number
 109  * of hits would skip it.</p>
 110  *
 111  * <p><b>NOTE:</b> Prior to 2.9, Lucene silently filtered
 112  * out hits with score &lt;= 0.  As of 2.9, the core Collectors
 113  * no longer do that.  It's very unusual to have such hits
 114  * (a negative query boost, or function query returning
 115  * negative custom scores, could cause it to happen).  If
 116  * you need that behavior, use {@link
 117  * PositiveScoresOnlyCollector}.</p>
 118  *
 119  * @lucene.experimental
 120  *
 121  * @since 2.9
 122  */
 123 public abstract class Collector {
 124
 125   /**
 126    * Called before successive calls to {@link #collect(int)}. Implementations that
 127    * need the score of the current document (the one passed to {@link #collect(int)})
 128    * should save the passed-in Scorer and call {@link Scorer#score()} when needed.
 129    * @param scorer source of the score for the document currently being collected
 130    */
 131   public abstract void setScorer(Scorer scorer) throws IOException;
 132
 133   /**
 134    * Called once for every document matching a query, with the unbased document
 135    * number, i.e. the doc id relative to the current reader (docBase not added).
 136    *
 137    * <p>Note: This is called in an inner search loop. For good search performance,
 138    * implementations of this method should not call {@link Searcher#doc(int)} or
 139    * {@link org.apache.lucene.index.IndexReader#document(int)} on every hit.
 140    * Doing so can slow searches by an order of magnitude or more.</p>
 141    * @param doc reader-relative id of the matching document
 142    */
 143   public abstract void collect(int doc) throws IOException;
 144
 145   /**
 146    * Called before collecting from each IndexReader. All doc ids passed to
 147    * {@link #collect(int)} until the next call will correspond to reader.
 148    *
 149    * <p>Add docBase to the current IndexReader's internal document id to re-base ids
 150    * in {@link #collect(int)}.</p>
 151    *
 152    * @param reader
 153    *          next IndexReader
 154    * @param docBase value to add to reader-internal document ids to re-base them
 155    */
 156   public abstract void setNextReader(IndexReader reader, int docBase) throws IOException;
 157
 158   /**
 159    * Return <code>true</code> if this collector does not
 160    * require the matching docIDs to be delivered in int sort
 161    * order (smallest to largest) to {@link #collect}.
 162    *
 163    * <p> Most Lucene Query implementations will visit
 164    * matching docIDs in order.  However, some queries
 165    * (currently limited to certain cases of {@link
 166    * BooleanQuery}) can achieve faster searching if the
 167    * <code>Collector</code> allows them to deliver the
 168    * docIDs out of order.</p>
 169    *
 170    * <p> Many collectors don't mind getting docIDs out of
 171    * order, so it's important to return <code>true</code> here.</p>
 172    * @return <code>true</code> if docIDs may be delivered out of order
 173    */
 174   public abstract boolean acceptsDocsOutOfOrder();
 175
 176 }