lucene-java-3.4.0/lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/BlockGroupingCollector.java

   1 package org.apache.lucene.search.grouping;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20
  21 import java.io.IOException;
  22
  23 import org.apache.lucene.index.IndexReader;
  24 import org.apache.lucene.index.IndexWriter;       // javadocs
  25 import org.apache.lucene.search.Collector;
  26 import org.apache.lucene.search.DocIdSetIterator;
  27 import org.apache.lucene.search.FieldComparator;
  28 import org.apache.lucene.search.Filter;
  29 import org.apache.lucene.search.Scorer;
  30 import org.apache.lucene.search.Sort;
  31 import org.apache.lucene.search.SortField;
  32 import org.apache.lucene.search.TopDocs;
  33 import org.apache.lucene.search.TopDocsCollector;
  34 import org.apache.lucene.search.TopFieldCollector;
  35 import org.apache.lucene.search.TopScoreDocCollector;
  36 import org.apache.lucene.search.Weight;
  37 import org.apache.lucene.util.ArrayUtil;
  38 import org.apache.lucene.util.PriorityQueue;
  39
  40 /** BlockGroupingCollector performs grouping with a
  41  *  single pass collector, as long as you are grouping by a
  42  *  doc block field, ie all documents sharing a given group
  43  *  value were indexed as a doc block using the atomic
  44  *  {@link IndexWriter#addDocuments} or {@link
  45  *  IndexWriter#updateDocuments} API.
  46  *
  47  *  <p>This results in faster performance (~25% faster QPS)
  48  *  than the two-pass grouping collectors, with the tradeoff
  49  *  being that the documents in each group must always be
  50  *  indexed as a block.  This collector also fills in
  51  *  TopGroups.totalGroupCount without requiring the separate
  52  *  {@link TermAllGroupsCollector}.  However, this collector does
  53  *  not fill in the groupValue of each group; this field
  54  *  will always be null.
  55  *
  56  *  <p><b>NOTE</b>: this collector makes no effort to verify
  57  *  the docs were in fact indexed as a block, so it's up to
  58  *  you to ensure this was the case.
  59  *
  60  *  <p>See {@link org.apache.lucene.search.grouping} for more
  61  *  details including a full code example.</p>
  62  *
  63  * @lucene.experimental
  64  */
  65
  66 public class BlockGroupingCollector extends Collector {
  67
  68   private int[] pendingSubDocs;
  69   private float[] pendingSubScores;
  70   private int subDocUpto;
  71
  72   private final Sort groupSort;
  73   private final int topNGroups;
  74   private final Filter lastDocPerGroup;
  75
  76   // TODO: specialize into 2 classes, static "create" method:
  77   private final boolean needsScores;
  78
  79   private final FieldComparator[] comparators;
  80   private final int[] reversed;
  81   private final int compIDXEnd;
  82   private int bottomSlot;
  83   private boolean queueFull;
  84   private IndexReader currentReader;
  85
  86   private int topGroupDoc;
  87   private int totalHitCount;
  88   private int totalGroupCount;
  89   private int docBase;
  90   private int groupEndDocID;
  91   private DocIdSetIterator lastDocPerGroupBits;
  92   private Scorer scorer;
  93   private final GroupQueue groupQueue;
  94   private boolean groupCompetes;
  95
  96   private final static class FakeScorer extends Scorer {
  97
  98     float score;
  99     int doc;
 100
 101     public FakeScorer() {
 102       super((Weight) null);
 103     }
 104
 105     @Override
 106     public float score() {
 107       return score;
 108     }
 109
 110     @Override
 111     public int docID() {
 112       return doc;
 113     }
 114
 115     @Override
 116     public int advance(int target) {
 117       throw new UnsupportedOperationException();
 118     }
 119
 120     @Override
 121     public int nextDoc() {
 122       throw new UnsupportedOperationException();
 123     }
 124   }
 125
 126   private static final class OneGroup {
 127     IndexReader reader;
 128     int docBase;
 129     //int groupOrd;
 130     int topGroupDoc;
 131     int[] docs;
 132     float[] scores;
 133     int count;
 134     int comparatorSlot;
 135   }
 136
 137   // Sorts by groupSort.  Not static -- uses comparators, reversed
 138   private final class GroupQueue extends PriorityQueue<OneGroup> {
 139
 140     public GroupQueue(int size) {
 141       initialize(size);
 142     }
 143
 144     @Override
 145     protected boolean lessThan(final OneGroup group1, final OneGroup group2) {
 146
 147       //System.out.println("    ltcheck");
 148       assert group1 != group2;
 149       assert group1.comparatorSlot != group2.comparatorSlot;
 150
 151       final int numComparators = comparators.length;
 152       for (int compIDX=0;compIDX < numComparators; compIDX++) {
 153         final int c = reversed[compIDX] * comparators[compIDX].compare(group1.comparatorSlot, group2.comparatorSlot);
 154         if (c != 0) {
 155           // Short circuit
 156           return c > 0;
 157         }
 158       }
 159
 160       // Break ties by docID; lower docID is always sorted first
 161       return group1.topGroupDoc > group2.topGroupDoc;
 162     }
 163   }
 164
 165   // Called when we transition to another group; if the
 166   // group is competitive we insert into the group queue
 167   private void processGroup() {
 168     totalGroupCount++;
 169     //System.out.println("    processGroup ord=" + lastGroupOrd + " competes=" + groupCompetes + " count=" + subDocUpto + " groupDoc=" + topGroupDoc);
 170     if (groupCompetes) {
 171       if (!queueFull) {
 172         // Startup transient: always add a new OneGroup
 173         final OneGroup og = new OneGroup();
 174         og.count = subDocUpto;
 175         og.topGroupDoc = docBase + topGroupDoc;
 176         og.docs = pendingSubDocs;
 177         pendingSubDocs = new int[10];
 178         if (needsScores) {
 179           og.scores = pendingSubScores;
 180           pendingSubScores = new float[10];
 181         }
 182         og.reader = currentReader;
 183         og.docBase = docBase;
 184         //og.groupOrd = lastGroupOrd;
 185         og.comparatorSlot = bottomSlot;
 186         final OneGroup bottomGroup = groupQueue.add(og);
 187         //System.out.println("      ADD group=" + getGroupString(lastGroupOrd) + " newBottom=" + getGroupString(bottomGroup.groupOrd));
 188         queueFull = groupQueue.size() == topNGroups;
 189         if (queueFull) {
 190           // Queue just became full; now set the real bottom
 191           // in the comparators:
 192           bottomSlot = bottomGroup.comparatorSlot;
 193           //System.out.println("    set bottom=" + bottomSlot);
 194           for (int i = 0; i < comparators.length; i++) {
 195             comparators[i].setBottom(bottomSlot);
 196           }
 197           //System.out.println("     QUEUE FULL");
 198         } else {
 199           // Queue not full yet -- just advance bottomSlot:
 200           bottomSlot = groupQueue.size();
 201         }
 202       } else {
 203         // Replace bottom element in PQ and then updateTop
 204         final OneGroup og = groupQueue.top();
 205         assert og != null;
 206         og.count = subDocUpto;
 207         og.topGroupDoc = docBase + topGroupDoc;
 208         // Swap pending docs
 209         final int[] savDocs = og.docs;
 210         og.docs = pendingSubDocs;
 211         pendingSubDocs = savDocs;
 212         if (needsScores) {
 213           // Swap pending scores
 214           final float[] savScores = og.scores;
 215           og.scores = pendingSubScores;
 216           pendingSubScores = savScores;
 217         }
 218         og.reader = currentReader;
 219         og.docBase = docBase;
 220         //og.groupOrd = lastGroupOrd;
 221         bottomSlot = groupQueue.updateTop().comparatorSlot;
 222
 223         //System.out.println("    set bottom=" + bottomSlot);
 224         for (int i = 0; i < comparators.length; i++) {
 225           comparators[i].setBottom(bottomSlot);
 226         }
 227       }
 228     }
 229     subDocUpto = 0;
 230   }
 231
 232   /**
 233    * Create the single pass collector.
 234    *
 235    *  @param groupSort The {@link Sort} used to sort the
 236    *    groups.  The top sorted document within each group
 237    *    according to groupSort, determines how that group
 238    *    sorts against other groups.  This must be non-null,
 239    *    ie, if you want to groupSort by relevance use
 240    *    Sort.RELEVANCE.
 241    *  @param topNGroups How many top groups to keep.
 242    *  @param needsScores true if the collected documents
 243    *    require scores, either because relevance is included
 244    *    in the withinGroupSort or because you plan to pass true
 245    *    for either getSscores or getMaxScores to {@link
 246    *    #getTopGroups}
 247    *  @param lastDocPerGroup a {@link Filter} that marks the
 248    *    last document in each group.
 249    */
 250   public BlockGroupingCollector(Sort groupSort, int topNGroups, boolean needsScores, Filter lastDocPerGroup) throws IOException {
 251
 252     if (topNGroups < 1) {
 253       throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")");
 254     }
 255
 256     groupQueue = new GroupQueue(topNGroups);
 257     pendingSubDocs = new int[10];
 258     if (needsScores) {
 259       pendingSubScores = new float[10];
 260     }
 261
 262     this.needsScores = needsScores;
 263     this.lastDocPerGroup = lastDocPerGroup;
 264     // TODO: allow null groupSort to mean "by relevance",
 265     // and specialize it?
 266     this.groupSort = groupSort;
 267
 268     this.topNGroups = topNGroups;
 269
 270     final SortField[] sortFields = groupSort.getSort();
 271     comparators = new FieldComparator[sortFields.length];
 272     compIDXEnd = comparators.length - 1;
 273     reversed = new int[sortFields.length];
 274     for (int i = 0; i < sortFields.length; i++) {
 275       final SortField sortField = sortFields[i];
 276       comparators[i] = sortField.getComparator(topNGroups, i);
 277       reversed[i] = sortField.getReverse() ? -1 : 1;
 278     }
 279   }
 280
 281   // TODO: maybe allow no sort on retrieving groups?  app
 282   // may want to simply process docs in the group itself?
 283   // typically they will be presented as a "single" result
 284   // in the UI?
 285
 286   /** Returns the grouped results.  Returns null if the
 287    *  number of groups collected is <= groupOffset.
 288    *
 289    *  <p><b>NOTE</b>: This collector is unable to compute
 290    *  the groupValue per group so it will always be null.
 291    *  This is normally not a problem, as you can obtain the
 292    *  value just like you obtain other values for each
 293    *  matching document (eg, via stored fields, via
 294    *  FieldCache, etc.)
 295    *
 296    *  @param withinGroupSort The {@link Sort} used to sort
 297    *    documents within each group.  Passing null is
 298    *    allowed, to sort by relevance.
 299    *  @param groupOffset Which group to start from
 300    *  @param withinGroupOffset Which document to start from
 301    *    within each group
 302    *  @param maxDocsPerGroup How many top documents to keep
 303    *     within each group.
 304    *  @param fillSortFields If true then the Comparable
 305    *     values for the sort fields will be set
 306    */
 307   public TopGroups getTopGroups(Sort withinGroupSort, int groupOffset, int withinGroupOffset, int maxDocsPerGroup, boolean fillSortFields) throws IOException {
 308
 309     //if (queueFull) {
 310     //System.out.println("getTopGroups groupOffset=" + groupOffset + " topNGroups=" + topNGroups);
 311     //}
 312     if (subDocUpto != 0) {
 313       processGroup();
 314     }
 315     if (groupOffset >= groupQueue.size()) {
 316       return null;
 317     }
 318     int totalGroupedHitCount = 0;
 319
 320     final FakeScorer fakeScorer = new FakeScorer();
 321
 322     @SuppressWarnings("unchecked")
 323     final GroupDocs<Object>[] groups = new GroupDocs[groupQueue.size() - groupOffset];
 324     for(int downTo=groupQueue.size()-groupOffset-1;downTo>=0;downTo--) {
 325       final OneGroup og = groupQueue.pop();
 326
 327       // At this point we hold all docs w/ in each group,
 328       // unsorted; we now sort them:
 329       final TopDocsCollector collector;
 330       if (withinGroupSort == null) {
 331         // Sort by score
 332         if (!needsScores) {
 333           throw new IllegalArgumentException("cannot sort by relevance within group: needsScores=false");
 334         }
 335         collector = TopScoreDocCollector.create(maxDocsPerGroup, true);
 336       } else {
 337         // Sort by fields
 338         collector = TopFieldCollector.create(withinGroupSort, maxDocsPerGroup, fillSortFields, needsScores, needsScores, true);
 339       }
 340
 341       collector.setScorer(fakeScorer);
 342       collector.setNextReader(og.reader, og.docBase);
 343       for(int docIDX=0;docIDX<og.count;docIDX++) {
 344         final int doc = og.docs[docIDX];
 345         fakeScorer.doc = doc;
 346         if (needsScores) {
 347           fakeScorer.score = og.scores[docIDX];
 348         }
 349         collector.collect(doc);
 350       }
 351       totalGroupedHitCount += og.count;
 352
 353       final Object[] groupSortValues;
 354
 355       if (fillSortFields) {
 356         groupSortValues = new Comparable[comparators.length];
 357         for(int sortFieldIDX=0;sortFieldIDX<comparators.length;sortFieldIDX++) {
 358           groupSortValues[sortFieldIDX] = comparators[sortFieldIDX].value(og.comparatorSlot);
 359         }
 360       } else {
 361         groupSortValues = null;
 362       }
 363
 364       final TopDocs topDocs = collector.topDocs(withinGroupOffset, maxDocsPerGroup);
 365
 366       groups[downTo] = new GroupDocs<Object>(topDocs.getMaxScore(),
 367                                      og.count,
 368                                      topDocs.scoreDocs,
 369                                      null,
 370                                      groupSortValues);
 371     }
 372
 373     /*
 374     while (groupQueue.size() != 0) {
 375       final OneGroup og = groupQueue.pop();
 376       //System.out.println("  leftover: og ord=" + og.groupOrd + " count=" + og.count);
 377       totalGroupedHitCount += og.count;
 378     }
 379     */
 380
 381     return new TopGroups<Object>(new TopGroups<Object>(groupSort.getSort(),
 382                                        withinGroupSort == null ? null : withinGroupSort.getSort(),
 383                                        totalHitCount, totalGroupedHitCount, groups),
 384                          totalGroupCount);
 385   }
 386
 387   @Override
 388   public void setScorer(Scorer scorer) throws IOException {
 389     this.scorer = scorer;
 390     for (FieldComparator comparator : comparators) {
 391       comparator.setScorer(scorer);
 392     }
 393   }
 394
 395   @Override
 396   public void collect(int doc) throws IOException {
 397
 398     // System.out.println("C " + doc);
 399
 400     if (doc > groupEndDocID) {
 401       // Group changed
 402       if (subDocUpto != 0) {
 403         processGroup();
 404       }
 405       groupEndDocID = lastDocPerGroupBits.advance(doc);
 406       //System.out.println("  adv " + groupEndDocID + " " + lastDocPerGroupBits);
 407       subDocUpto = 0;
 408       groupCompetes = !queueFull;
 409     }
 410
 411     totalHitCount++;
 412
 413     // Always cache doc/score within this group:
 414     if (subDocUpto == pendingSubDocs.length) {
 415       pendingSubDocs = ArrayUtil.grow(pendingSubDocs);
 416     }
 417     pendingSubDocs[subDocUpto] = doc;
 418     if (needsScores) {
 419       if (subDocUpto == pendingSubScores.length) {
 420         pendingSubScores = ArrayUtil.grow(pendingSubScores);
 421       }
 422       pendingSubScores[subDocUpto] = scorer.score();
 423     }
 424     subDocUpto++;
 425
 426     if (groupCompetes) {
 427       if (subDocUpto == 1) {
 428         assert !queueFull;
 429
 430         //System.out.println("    init copy to bottomSlot=" + bottomSlot);
 431         for (FieldComparator fc : comparators) {
 432           fc.copy(bottomSlot, doc);
 433           fc.setBottom(bottomSlot);
 434         }
 435         topGroupDoc = doc;
 436       } else {
 437         // Compare to bottomSlot
 438         for (int compIDX = 0;; compIDX++) {
 439           final int c = reversed[compIDX] * comparators[compIDX].compareBottom(doc);
 440           if (c < 0) {
 441             // Definitely not competitive -- done
 442             return;
 443           } else if (c > 0) {
 444             // Definitely competitive.
 445             break;
 446           } else if (compIDX == compIDXEnd) {
 447             // Ties with bottom, except we know this docID is
 448             // > docID in the queue (docs are visited in
 449             // order), so not competitive:
 450             return;
 451           }
 452         }
 453
 454         //System.out.println("       best w/in group!");
 455
 456         for (FieldComparator fc : comparators) {
 457           fc.copy(bottomSlot, doc);
 458           // Necessary because some comparators cache
 459           // details of bottom slot; this forces them to
 460           // re-cache:
 461           fc.setBottom(bottomSlot);
 462         }
 463         topGroupDoc = doc;
 464       }
 465     } else {
 466       // We're not sure this group will make it into the
 467       // queue yet
 468       for (int compIDX = 0;; compIDX++) {
 469         final int c = reversed[compIDX] * comparators[compIDX].compareBottom(doc);
 470         if (c < 0) {
 471           // Definitely not competitive -- done
 472           //System.out.println("    doc doesn't compete w/ top groups");
 473           return;
 474         } else if (c > 0) {
 475           // Definitely competitive.
 476           break;
 477         } else if (compIDX == compIDXEnd) {
 478           // Ties with bottom, except we know this docID is
 479           // > docID in the queue (docs are visited in
 480           // order), so not competitive:
 481           //System.out.println("    doc doesn't compete w/ top groups");
 482           return;
 483         }
 484       }
 485       groupCompetes = true;
 486       for (FieldComparator fc : comparators) {
 487         fc.copy(bottomSlot, doc);
 488         // Necessary because some comparators cache
 489         // details of bottom slot; this forces them to
 490         // re-cache:
 491         fc.setBottom(bottomSlot);
 492       }
 493       topGroupDoc = doc;
 494       //System.out.println("        doc competes w/ top groups");
 495     }
 496   }
 497
 498   @Override
 499   public boolean acceptsDocsOutOfOrder() {
 500     return false;
 501   }
 502
 503   @Override
 504   public void setNextReader(IndexReader reader, int docBase) throws IOException {
 505     if (subDocUpto != 0) {
 506       processGroup();
 507     }
 508     subDocUpto = 0;
 509     this.docBase = docBase;
 510     //System.out.println("setNextReader base=" + docBase + " r=" + readerContext.reader);
 511     lastDocPerGroupBits = lastDocPerGroup.getDocIdSet(reader).iterator();
 512     groupEndDocID = -1;
 513
 514     currentReader = reader;
 515     for (int i=0; i<comparators.length; i++) {
 516       comparators[i].setNextReader(reader, docBase);
 517     }
 518   }
 519 }