1 package org.apache.lucene.search.grouping;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 import java.io.IOException;
23 import org.apache.lucene.index.IndexReader;
24 import org.apache.lucene.index.IndexWriter; // javadocs
25 import org.apache.lucene.search.Collector;
26 import org.apache.lucene.search.DocIdSetIterator;
27 import org.apache.lucene.search.FieldComparator;
28 import org.apache.lucene.search.Filter;
29 import org.apache.lucene.search.Scorer;
30 import org.apache.lucene.search.Sort;
31 import org.apache.lucene.search.SortField;
32 import org.apache.lucene.search.TopDocs;
33 import org.apache.lucene.search.TopDocsCollector;
34 import org.apache.lucene.search.TopFieldCollector;
35 import org.apache.lucene.search.TopScoreDocCollector;
36 import org.apache.lucene.search.Weight;
37 import org.apache.lucene.util.ArrayUtil;
38 import org.apache.lucene.util.PriorityQueue;
40 /** BlockGroupingCollector performs grouping with a
41 * single pass collector, as long as you are grouping by a
42 * doc block field, ie all documents sharing a given group
43 * value were indexed as a doc block using the atomic
44 * {@link IndexWriter#addDocuments} or {@link
45 * IndexWriter#updateDocuments} API.
47 * <p>This results in faster performance (~25% faster QPS)
48 * than the two-pass grouping collectors, with the tradeoff
49 * being that the documents in each group must always be
50 * indexed as a block. This collector also fills in
51 * TopGroups.totalGroupCount without requiring the separate
52 * {@link TermAllGroupsCollector}. However, this collector does
53 * not fill in the groupValue of each group; this field
54 * will always be null.
56 * <p><b>NOTE</b>: this collector makes no effort to verify
57 * the docs were in fact indexed as a block, so it's up to
58 * you to ensure this was the case.
60 * <p>See {@link org.apache.lucene.search.grouping} for more
61 * details including a full code example.</p>
63 * @lucene.experimental
66 public class BlockGroupingCollector extends Collector {
68 private int[] pendingSubDocs;
69 private float[] pendingSubScores;
70 private int subDocUpto;
72 private final Sort groupSort;
73 private final int topNGroups;
74 private final Filter lastDocPerGroup;
76 // TODO: specialize into 2 classes, static "create" method:
77 private final boolean needsScores;
79 private final FieldComparator[] comparators;
80 private final int[] reversed;
81 private final int compIDXEnd;
82 private int bottomSlot;
83 private boolean queueFull;
84 private IndexReader currentReader;
86 private int topGroupDoc;
87 private int totalHitCount;
88 private int totalGroupCount;
90 private int groupEndDocID;
91 private DocIdSetIterator lastDocPerGroupBits;
92 private Scorer scorer;
93 private final GroupQueue groupQueue;
94 private boolean groupCompetes;
96 private final static class FakeScorer extends Scorer {
101 public FakeScorer() {
102 super((Weight) null);
106 public float score() {
116 public int advance(int target) {
117 throw new UnsupportedOperationException();
121 public int nextDoc() {
122 throw new UnsupportedOperationException();
126 private static final class OneGroup {
137 // Sorts by groupSort. Not static -- uses comparators, reversed
138 private final class GroupQueue extends PriorityQueue<OneGroup> {
140 public GroupQueue(int size) {
145 protected boolean lessThan(final OneGroup group1, final OneGroup group2) {
147 //System.out.println(" ltcheck");
148 assert group1 != group2;
149 assert group1.comparatorSlot != group2.comparatorSlot;
151 final int numComparators = comparators.length;
152 for (int compIDX=0;compIDX < numComparators; compIDX++) {
153 final int c = reversed[compIDX] * comparators[compIDX].compare(group1.comparatorSlot, group2.comparatorSlot);
160 // Break ties by docID; lower docID is always sorted first
161 return group1.topGroupDoc > group2.topGroupDoc;
165 // Called when we transition to another group; if the
166 // group is competitive we insert into the group queue
167 private void processGroup() {
169 //System.out.println(" processGroup ord=" + lastGroupOrd + " competes=" + groupCompetes + " count=" + subDocUpto + " groupDoc=" + topGroupDoc);
172 // Startup transient: always add a new OneGroup
173 final OneGroup og = new OneGroup();
174 og.count = subDocUpto;
175 og.topGroupDoc = docBase + topGroupDoc;
176 og.docs = pendingSubDocs;
177 pendingSubDocs = new int[10];
179 og.scores = pendingSubScores;
180 pendingSubScores = new float[10];
182 og.reader = currentReader;
183 og.docBase = docBase;
184 //og.groupOrd = lastGroupOrd;
185 og.comparatorSlot = bottomSlot;
186 final OneGroup bottomGroup = groupQueue.add(og);
187 //System.out.println(" ADD group=" + getGroupString(lastGroupOrd) + " newBottom=" + getGroupString(bottomGroup.groupOrd));
188 queueFull = groupQueue.size() == topNGroups;
190 // Queue just became full; now set the real bottom
191 // in the comparators:
192 bottomSlot = bottomGroup.comparatorSlot;
193 //System.out.println(" set bottom=" + bottomSlot);
194 for (int i = 0; i < comparators.length; i++) {
195 comparators[i].setBottom(bottomSlot);
197 //System.out.println(" QUEUE FULL");
199 // Queue not full yet -- just advance bottomSlot:
200 bottomSlot = groupQueue.size();
203 // Replace bottom element in PQ and then updateTop
204 final OneGroup og = groupQueue.top();
206 og.count = subDocUpto;
207 og.topGroupDoc = docBase + topGroupDoc;
209 final int[] savDocs = og.docs;
210 og.docs = pendingSubDocs;
211 pendingSubDocs = savDocs;
213 // Swap pending scores
214 final float[] savScores = og.scores;
215 og.scores = pendingSubScores;
216 pendingSubScores = savScores;
218 og.reader = currentReader;
219 og.docBase = docBase;
220 //og.groupOrd = lastGroupOrd;
221 bottomSlot = groupQueue.updateTop().comparatorSlot;
223 //System.out.println(" set bottom=" + bottomSlot);
224 for (int i = 0; i < comparators.length; i++) {
225 comparators[i].setBottom(bottomSlot);
233 * Create the single pass collector.
235 * @param groupSort The {@link Sort} used to sort the
236 * groups. The top sorted document within each group
237 * according to groupSort, determines how that group
238 * sorts against other groups. This must be non-null,
239 * ie, if you want to groupSort by relevance use
241 * @param topNGroups How many top groups to keep.
242 * @param needsScores true if the collected documents
243 * require scores, either because relevance is included
244 * in the withinGroupSort or because you plan to pass true
245 * for either getSscores or getMaxScores to {@link
247 * @param lastDocPerGroup a {@link Filter} that marks the
248 * last document in each group.
250 public BlockGroupingCollector(Sort groupSort, int topNGroups, boolean needsScores, Filter lastDocPerGroup) throws IOException {
252 if (topNGroups < 1) {
253 throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")");
256 groupQueue = new GroupQueue(topNGroups);
257 pendingSubDocs = new int[10];
259 pendingSubScores = new float[10];
262 this.needsScores = needsScores;
263 this.lastDocPerGroup = lastDocPerGroup;
264 // TODO: allow null groupSort to mean "by relevance",
265 // and specialize it?
266 this.groupSort = groupSort;
268 this.topNGroups = topNGroups;
270 final SortField[] sortFields = groupSort.getSort();
271 comparators = new FieldComparator[sortFields.length];
272 compIDXEnd = comparators.length - 1;
273 reversed = new int[sortFields.length];
274 for (int i = 0; i < sortFields.length; i++) {
275 final SortField sortField = sortFields[i];
276 comparators[i] = sortField.getComparator(topNGroups, i);
277 reversed[i] = sortField.getReverse() ? -1 : 1;
281 // TODO: maybe allow no sort on retrieving groups? app
282 // may want to simply process docs in the group itself?
283 // typically they will be presented as a "single" result
286 /** Returns the grouped results. Returns null if the
287 * number of groups collected is <= groupOffset.
289 * <p><b>NOTE</b>: This collector is unable to compute
290 * the groupValue per group so it will always be null.
291 * This is normally not a problem, as you can obtain the
292 * value just like you obtain other values for each
293 * matching document (eg, via stored fields, via
296 * @param withinGroupSort The {@link Sort} used to sort
297 * documents within each group. Passing null is
298 * allowed, to sort by relevance.
299 * @param groupOffset Which group to start from
300 * @param withinGroupOffset Which document to start from
302 * @param maxDocsPerGroup How many top documents to keep
304 * @param fillSortFields If true then the Comparable
305 * values for the sort fields will be set
307 public TopGroups getTopGroups(Sort withinGroupSort, int groupOffset, int withinGroupOffset, int maxDocsPerGroup, boolean fillSortFields) throws IOException {
310 //System.out.println("getTopGroups groupOffset=" + groupOffset + " topNGroups=" + topNGroups);
312 if (subDocUpto != 0) {
315 if (groupOffset >= groupQueue.size()) {
318 int totalGroupedHitCount = 0;
320 final FakeScorer fakeScorer = new FakeScorer();
322 @SuppressWarnings("unchecked")
323 final GroupDocs<Object>[] groups = new GroupDocs[groupQueue.size() - groupOffset];
324 for(int downTo=groupQueue.size()-groupOffset-1;downTo>=0;downTo--) {
325 final OneGroup og = groupQueue.pop();
327 // At this point we hold all docs w/ in each group,
328 // unsorted; we now sort them:
329 final TopDocsCollector collector;
330 if (withinGroupSort == null) {
333 throw new IllegalArgumentException("cannot sort by relevance within group: needsScores=false");
335 collector = TopScoreDocCollector.create(maxDocsPerGroup, true);
338 collector = TopFieldCollector.create(withinGroupSort, maxDocsPerGroup, fillSortFields, needsScores, needsScores, true);
341 collector.setScorer(fakeScorer);
342 collector.setNextReader(og.reader, og.docBase);
343 for(int docIDX=0;docIDX<og.count;docIDX++) {
344 final int doc = og.docs[docIDX];
345 fakeScorer.doc = doc;
347 fakeScorer.score = og.scores[docIDX];
349 collector.collect(doc);
351 totalGroupedHitCount += og.count;
353 final Object[] groupSortValues;
355 if (fillSortFields) {
356 groupSortValues = new Comparable[comparators.length];
357 for(int sortFieldIDX=0;sortFieldIDX<comparators.length;sortFieldIDX++) {
358 groupSortValues[sortFieldIDX] = comparators[sortFieldIDX].value(og.comparatorSlot);
361 groupSortValues = null;
364 final TopDocs topDocs = collector.topDocs(withinGroupOffset, maxDocsPerGroup);
366 groups[downTo] = new GroupDocs<Object>(topDocs.getMaxScore(),
374 while (groupQueue.size() != 0) {
375 final OneGroup og = groupQueue.pop();
376 //System.out.println(" leftover: og ord=" + og.groupOrd + " count=" + og.count);
377 totalGroupedHitCount += og.count;
381 return new TopGroups<Object>(new TopGroups<Object>(groupSort.getSort(),
382 withinGroupSort == null ? null : withinGroupSort.getSort(),
383 totalHitCount, totalGroupedHitCount, groups),
388 public void setScorer(Scorer scorer) throws IOException {
389 this.scorer = scorer;
390 for (FieldComparator comparator : comparators) {
391 comparator.setScorer(scorer);
396 public void collect(int doc) throws IOException {
398 // System.out.println("C " + doc);
400 if (doc > groupEndDocID) {
402 if (subDocUpto != 0) {
405 groupEndDocID = lastDocPerGroupBits.advance(doc);
406 //System.out.println(" adv " + groupEndDocID + " " + lastDocPerGroupBits);
408 groupCompetes = !queueFull;
413 // Always cache doc/score within this group:
414 if (subDocUpto == pendingSubDocs.length) {
415 pendingSubDocs = ArrayUtil.grow(pendingSubDocs);
417 pendingSubDocs[subDocUpto] = doc;
419 if (subDocUpto == pendingSubScores.length) {
420 pendingSubScores = ArrayUtil.grow(pendingSubScores);
422 pendingSubScores[subDocUpto] = scorer.score();
427 if (subDocUpto == 1) {
430 //System.out.println(" init copy to bottomSlot=" + bottomSlot);
431 for (FieldComparator fc : comparators) {
432 fc.copy(bottomSlot, doc);
433 fc.setBottom(bottomSlot);
437 // Compare to bottomSlot
438 for (int compIDX = 0;; compIDX++) {
439 final int c = reversed[compIDX] * comparators[compIDX].compareBottom(doc);
441 // Definitely not competitive -- done
444 // Definitely competitive.
446 } else if (compIDX == compIDXEnd) {
447 // Ties with bottom, except we know this docID is
448 // > docID in the queue (docs are visited in
449 // order), so not competitive:
454 //System.out.println(" best w/in group!");
456 for (FieldComparator fc : comparators) {
457 fc.copy(bottomSlot, doc);
458 // Necessary because some comparators cache
459 // details of bottom slot; this forces them to
461 fc.setBottom(bottomSlot);
466 // We're not sure this group will make it into the
468 for (int compIDX = 0;; compIDX++) {
469 final int c = reversed[compIDX] * comparators[compIDX].compareBottom(doc);
471 // Definitely not competitive -- done
472 //System.out.println(" doc doesn't compete w/ top groups");
475 // Definitely competitive.
477 } else if (compIDX == compIDXEnd) {
478 // Ties with bottom, except we know this docID is
479 // > docID in the queue (docs are visited in
480 // order), so not competitive:
481 //System.out.println(" doc doesn't compete w/ top groups");
485 groupCompetes = true;
486 for (FieldComparator fc : comparators) {
487 fc.copy(bottomSlot, doc);
488 // Necessary because some comparators cache
489 // details of bottom slot; this forces them to
491 fc.setBottom(bottomSlot);
494 //System.out.println(" doc competes w/ top groups");
499 public boolean acceptsDocsOutOfOrder() {
504 public void setNextReader(IndexReader reader, int docBase) throws IOException {
505 if (subDocUpto != 0) {
509 this.docBase = docBase;
510 //System.out.println("setNextReader base=" + docBase + " r=" + readerContext.reader);
511 lastDocPerGroupBits = lastDocPerGroup.getDocIdSet(reader).iterator();
514 currentReader = reader;
515 for (int i=0; i<comparators.length; i++) {
516 comparators[i].setNextReader(reader, docBase);