1 package org.apache.lucene.search.grouping;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
23 import org.apache.lucene.index.IndexReader;
24 import org.apache.lucene.search.*;
26 /** FirstPassGroupingCollector is the first of two passes necessary
27 * to collect grouped hits. This pass gathers the top N sorted
28 * groups. Concrete subclasses define what a group is and how it
29 * is internally collected.
31 * <p>See {@link org.apache.lucene.search.grouping} for more
32 * details including a full code example.</p>
34 * @lucene.experimental
36 abstract public class AbstractFirstPassGroupingCollector<GROUP_VALUE_TYPE> extends Collector {
38 private final Sort groupSort;
39 private final FieldComparator[] comparators;
40 private final int[] reversed;
41 private final int topNGroups;
42 private final HashMap<GROUP_VALUE_TYPE, CollectedSearchGroup<GROUP_VALUE_TYPE>> groupMap;
43 private final int compIDXEnd;
45 // Set once we reach topNGroups unique groups:
46 private TreeSet<CollectedSearchGroup<GROUP_VALUE_TYPE>> orderedGroups;
48 private int spareSlot;
51 * Create the first pass collector.
53 * @param groupSort The {@link Sort} used to sort the
54 * groups. The top sorted document within each group
55 * according to groupSort, determines how that group
56 * sorts against other groups. This must be non-null,
57 * ie, if you want to groupSort by relevance use
59 * @param topNGroups How many top groups to keep.
60 * @throws IOException If I/O related errors occur
62 public AbstractFirstPassGroupingCollector(Sort groupSort, int topNGroups) throws IOException {
64 throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")");
67 // TODO: allow null groupSort to mean "by relevance",
69 this.groupSort = groupSort;
71 this.topNGroups = topNGroups;
73 final SortField[] sortFields = groupSort.getSort();
74 comparators = new FieldComparator[sortFields.length];
75 compIDXEnd = comparators.length - 1;
76 reversed = new int[sortFields.length];
77 for (int i = 0; i < sortFields.length; i++) {
78 final SortField sortField = sortFields[i];
80 // use topNGroups + 1 so we have a spare slot to use for comparing (tracked by this.spareSlot):
81 comparators[i] = sortField.getComparator(topNGroups + 1, i);
82 reversed[i] = sortField.getReverse() ? -1 : 1;
85 spareSlot = topNGroups;
86 groupMap = new HashMap<GROUP_VALUE_TYPE, CollectedSearchGroup<GROUP_VALUE_TYPE>>(topNGroups);
90 * Returns top groups, starting from offset. This may
91 * return null, if no groups were collected, or if the
92 * number of unique groups collected is <= offset.
94 * @param groupOffset The offset in the collected groups
95 * @param fillFields Whether to fill to {@link SearchGroup#sortValues}
96 * @return top groups, starting from offset
98 public Collection<SearchGroup<GROUP_VALUE_TYPE>> getTopGroups(int groupOffset, boolean fillFields) {
100 //System.out.println("FP.getTopGroups groupOffset=" + groupOffset + " fillFields=" + fillFields + " groupMap.size()=" + groupMap.size());
102 if (groupOffset < 0) {
103 throw new IllegalArgumentException("groupOffset must be >= 0 (got " + groupOffset + ")");
106 if (groupMap.size() <= groupOffset) {
110 if (orderedGroups == null) {
114 final Collection<SearchGroup<GROUP_VALUE_TYPE>> result = new ArrayList<SearchGroup<GROUP_VALUE_TYPE>>();
116 final int sortFieldCount = groupSort.getSort().length;
117 for(CollectedSearchGroup<GROUP_VALUE_TYPE> group : orderedGroups) {
118 if (upto++ < groupOffset) {
121 //System.out.println(" group=" + (group.groupValue == null ? "null" : group.groupValue.utf8ToString()));
122 SearchGroup<GROUP_VALUE_TYPE> searchGroup = new SearchGroup<GROUP_VALUE_TYPE>();
123 searchGroup.groupValue = group.groupValue;
125 searchGroup.sortValues = new Object[sortFieldCount];
126 for(int sortFieldIDX=0;sortFieldIDX<sortFieldCount;sortFieldIDX++) {
127 searchGroup.sortValues[sortFieldIDX] = comparators[sortFieldIDX].value(group.comparatorSlot);
130 result.add(searchGroup);
132 //System.out.println(" return " + result.size() + " groups");
137 public void setScorer(Scorer scorer) throws IOException {
138 for (FieldComparator comparator : comparators) {
139 comparator.setScorer(scorer);
144 public void collect(int doc) throws IOException {
145 //System.out.println("FP.collect doc=" + doc);
147 // If orderedGroups != null we already have collected N groups and
148 // can short circuit by comparing this document to the bottom group,
149 // without having to find what group this document belongs to.
151 // Even if this document belongs to a group in the top N, we'll know that
152 // we don't have to update that group.
154 // Downside: if the number of unique groups is very low, this is
155 // wasted effort as we will most likely be updating an existing group.
156 if (orderedGroups != null) {
157 for (int compIDX = 0;; compIDX++) {
158 final int c = reversed[compIDX] * comparators[compIDX].compareBottom(doc);
160 // Definitely not competitive. So don't even bother to continue
163 // Definitely competitive.
165 } else if (compIDX == compIDXEnd) {
166 // Here c=0. If we're at the last comparator, this doc is not
167 // competitive, since docs are visited in doc Id order, which means
168 // this doc cannot compete with any other document in the queue.
174 // TODO: should we add option to mean "ignore docs that
175 // don't have the group field" (instead of stuffing them
176 // under null group)?
177 final GROUP_VALUE_TYPE groupValue = getDocGroupValue(doc);
179 final CollectedSearchGroup<GROUP_VALUE_TYPE> group = groupMap.get(groupValue);
183 // First time we are seeing this group, or, we've seen
184 // it before but it fell out of the top N and is now
187 if (groupMap.size() < topNGroups) {
189 // Still in startup transient: we have not
190 // seen enough unique groups to start pruning them;
191 // just keep collecting them
193 // Add a new CollectedSearchGroup:
194 CollectedSearchGroup<GROUP_VALUE_TYPE> sg = new CollectedSearchGroup<GROUP_VALUE_TYPE>();
195 sg.groupValue = copyDocGroupValue(groupValue, null);
196 sg.comparatorSlot = groupMap.size();
197 sg.topDoc = docBase + doc;
198 for (FieldComparator fc : comparators) {
199 fc.copy(sg.comparatorSlot, doc);
201 groupMap.put(sg.groupValue, sg);
203 if (groupMap.size() == topNGroups) {
204 // End of startup transient: we now have max
205 // number of groups; from here on we will drop
206 // bottom group when we insert new one:
213 // We already tested that the document is competitive, so replace
214 // the bottom group with this new group.
216 // java 6-only: final CollectedSearchGroup bottomGroup = orderedGroups.pollLast();
217 final CollectedSearchGroup<GROUP_VALUE_TYPE> bottomGroup = orderedGroups.last();
218 orderedGroups.remove(bottomGroup);
219 assert orderedGroups.size() == topNGroups -1;
221 groupMap.remove(bottomGroup.groupValue);
223 // reuse the removed CollectedSearchGroup
224 bottomGroup.groupValue = copyDocGroupValue(groupValue, bottomGroup.groupValue);
225 bottomGroup.topDoc = docBase + doc;
227 for (FieldComparator fc : comparators) {
228 fc.copy(bottomGroup.comparatorSlot, doc);
231 groupMap.put(bottomGroup.groupValue, bottomGroup);
232 orderedGroups.add(bottomGroup);
233 assert orderedGroups.size() == topNGroups;
235 final int lastComparatorSlot = orderedGroups.last().comparatorSlot;
236 for (FieldComparator fc : comparators) {
237 fc.setBottom(lastComparatorSlot);
243 // Update existing group:
244 for (int compIDX = 0;; compIDX++) {
245 final FieldComparator fc = comparators[compIDX];
246 fc.copy(spareSlot, doc);
248 final int c = reversed[compIDX] * fc.compare(group.comparatorSlot, spareSlot);
250 // Definitely not competitive.
253 // Definitely competitive; set remaining comparators:
254 for (int compIDX2=compIDX+1; compIDX2<comparators.length; compIDX2++) {
255 comparators[compIDX2].copy(spareSlot, doc);
258 } else if (compIDX == compIDXEnd) {
259 // Here c=0. If we're at the last comparator, this doc is not
260 // competitive, since docs are visited in doc Id order, which means
261 // this doc cannot compete with any other document in the queue.
266 // Remove before updating the group since lookup is done via comparators
267 // TODO: optimize this
269 final CollectedSearchGroup<GROUP_VALUE_TYPE> prevLast;
270 if (orderedGroups != null) {
271 prevLast = orderedGroups.last();
272 orderedGroups.remove(group);
273 assert orderedGroups.size() == topNGroups-1;
278 group.topDoc = docBase + doc;
281 final int tmp = spareSlot;
282 spareSlot = group.comparatorSlot;
283 group.comparatorSlot = tmp;
285 // Re-add the changed group
286 if (orderedGroups != null) {
287 orderedGroups.add(group);
288 assert orderedGroups.size() == topNGroups;
289 final CollectedSearchGroup newLast = orderedGroups.last();
290 // If we changed the value of the last group, or changed which group was last, then update bottom:
291 if (group == newLast || prevLast != newLast) {
292 for (FieldComparator fc : comparators) {
293 fc.setBottom(newLast.comparatorSlot);
299 private void buildSortedSet() {
300 final Comparator<CollectedSearchGroup> comparator = new Comparator<CollectedSearchGroup>() {
301 public int compare(CollectedSearchGroup o1, CollectedSearchGroup o2) {
302 for (int compIDX = 0;; compIDX++) {
303 FieldComparator fc = comparators[compIDX];
304 final int c = reversed[compIDX] * fc.compare(o1.comparatorSlot, o2.comparatorSlot);
307 } else if (compIDX == compIDXEnd) {
308 return o1.topDoc - o2.topDoc;
314 orderedGroups = new TreeSet<CollectedSearchGroup<GROUP_VALUE_TYPE>>(comparator);
315 orderedGroups.addAll(groupMap.values());
316 assert orderedGroups.size() > 0;
318 for (FieldComparator fc : comparators) {
319 fc.setBottom(orderedGroups.last().comparatorSlot);
324 public boolean acceptsDocsOutOfOrder() {
329 public void setNextReader(IndexReader reader, int docBase) throws IOException {
330 this.docBase = docBase;
331 for (int i=0; i<comparators.length; i++) {
332 comparators[i].setNextReader(reader, docBase);
337 * Returns the group value for the specified doc.
339 * @param doc The specified doc
340 * @return the group value for the specified doc
342 protected abstract GROUP_VALUE_TYPE getDocGroupValue(int doc);
345 * Returns a copy of the specified group value by creating a new instance and copying the value from the specified
346 * groupValue in the new instance. Or optionally the reuse argument can be used to copy the group value in.
348 * @param groupValue The group value to copy
349 * @param reuse Optionally a reuse instance to prevent a new instance creation
350 * @return a copy of the specified group value
352 protected abstract GROUP_VALUE_TYPE copyDocGroupValue(GROUP_VALUE_TYPE groupValue, GROUP_VALUE_TYPE reuse);
355 class CollectedSearchGroup<T> extends SearchGroup<T> {