--- /dev/null
+package org.apache.lucene.search.grouping;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.*;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.*;
+
+/** FirstPassGroupingCollector is the first of two passes necessary
+ * to collect grouped hits. This pass gathers the top N sorted
+ * groups. Concrete subclasses define what a group is and how it
+ * is internally collected.
+ *
+ * <p>See {@link org.apache.lucene.search.grouping} for more
+ * details including a full code example.</p>
+ *
+ * @lucene.experimental
+ */
+abstract public class AbstractFirstPassGroupingCollector<GROUP_VALUE_TYPE> extends Collector {
+
+ private final Sort groupSort;
+ private final FieldComparator[] comparators;
+ private final int[] reversed;
+ private final int topNGroups;
+ private final HashMap<GROUP_VALUE_TYPE, CollectedSearchGroup<GROUP_VALUE_TYPE>> groupMap;
+ private final int compIDXEnd;
+
+ // Set once we reach topNGroups unique groups:
+ private TreeSet<CollectedSearchGroup<GROUP_VALUE_TYPE>> orderedGroups;
+ private int docBase;
+ private int spareSlot;
+
+ /**
+ * Create the first pass collector.
+ *
+ * @param groupSort The {@link Sort} used to sort the
+ * groups. The top sorted document within each group
+ * according to groupSort, determines how that group
+ * sorts against other groups. This must be non-null,
+ * ie, if you want to groupSort by relevance use
+ * Sort.RELEVANCE.
+ * @param topNGroups How many top groups to keep.
+ * @throws IOException If I/O related errors occur
+ */
+ public AbstractFirstPassGroupingCollector(Sort groupSort, int topNGroups) throws IOException {
+ if (topNGroups < 1) {
+ throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")");
+ }
+
+ // TODO: allow null groupSort to mean "by relevance",
+ // and specialize it?
+ this.groupSort = groupSort;
+
+ this.topNGroups = topNGroups;
+
+ final SortField[] sortFields = groupSort.getSort();
+ comparators = new FieldComparator[sortFields.length];
+ compIDXEnd = comparators.length - 1;
+ reversed = new int[sortFields.length];
+ for (int i = 0; i < sortFields.length; i++) {
+ final SortField sortField = sortFields[i];
+
+ // use topNGroups + 1 so we have a spare slot to use for comparing (tracked by this.spareSlot):
+ comparators[i] = sortField.getComparator(topNGroups + 1, i);
+ reversed[i] = sortField.getReverse() ? -1 : 1;
+ }
+
+ spareSlot = topNGroups;
+ groupMap = new HashMap<GROUP_VALUE_TYPE, CollectedSearchGroup<GROUP_VALUE_TYPE>>(topNGroups);
+ }
+
+ /**
+ * Returns top groups, starting from offset. This may
+ * return null, if no groups were collected, or if the
+ * number of unique groups collected is <= offset.
+ *
+ * @param groupOffset The offset in the collected groups
+ * @param fillFields Whether to fill to {@link SearchGroup#sortValues}
+ * @return top groups, starting from offset
+ */
+ public Collection<SearchGroup<GROUP_VALUE_TYPE>> getTopGroups(int groupOffset, boolean fillFields) {
+
+ //System.out.println("FP.getTopGroups groupOffset=" + groupOffset + " fillFields=" + fillFields + " groupMap.size()=" + groupMap.size());
+
+ if (groupOffset < 0) {
+ throw new IllegalArgumentException("groupOffset must be >= 0 (got " + groupOffset + ")");
+ }
+
+ if (groupMap.size() <= groupOffset) {
+ return null;
+ }
+
+ if (orderedGroups == null) {
+ buildSortedSet();
+ }
+
+ final Collection<SearchGroup<GROUP_VALUE_TYPE>> result = new ArrayList<SearchGroup<GROUP_VALUE_TYPE>>();
+ int upto = 0;
+ final int sortFieldCount = groupSort.getSort().length;
+ for(CollectedSearchGroup<GROUP_VALUE_TYPE> group : orderedGroups) {
+ if (upto++ < groupOffset) {
+ continue;
+ }
+ //System.out.println(" group=" + (group.groupValue == null ? "null" : group.groupValue.utf8ToString()));
+ SearchGroup<GROUP_VALUE_TYPE> searchGroup = new SearchGroup<GROUP_VALUE_TYPE>();
+ searchGroup.groupValue = group.groupValue;
+ if (fillFields) {
+ searchGroup.sortValues = new Object[sortFieldCount];
+ for(int sortFieldIDX=0;sortFieldIDX<sortFieldCount;sortFieldIDX++) {
+ searchGroup.sortValues[sortFieldIDX] = comparators[sortFieldIDX].value(group.comparatorSlot);
+ }
+ }
+ result.add(searchGroup);
+ }
+ //System.out.println(" return " + result.size() + " groups");
+ return result;
+ }
+
+ @Override
+ public void setScorer(Scorer scorer) throws IOException {
+ for (FieldComparator comparator : comparators) {
+ comparator.setScorer(scorer);
+ }
+ }
+
+ @Override
+ public void collect(int doc) throws IOException {
+ //System.out.println("FP.collect doc=" + doc);
+
+ // If orderedGroups != null we already have collected N groups and
+ // can short circuit by comparing this document to the bottom group,
+ // without having to find what group this document belongs to.
+
+ // Even if this document belongs to a group in the top N, we'll know that
+ // we don't have to update that group.
+
+ // Downside: if the number of unique groups is very low, this is
+ // wasted effort as we will most likely be updating an existing group.
+ if (orderedGroups != null) {
+ for (int compIDX = 0;; compIDX++) {
+ final int c = reversed[compIDX] * comparators[compIDX].compareBottom(doc);
+ if (c < 0) {
+ // Definitely not competitive. So don't even bother to continue
+ return;
+ } else if (c > 0) {
+ // Definitely competitive.
+ break;
+ } else if (compIDX == compIDXEnd) {
+ // Here c=0. If we're at the last comparator, this doc is not
+ // competitive, since docs are visited in doc Id order, which means
+ // this doc cannot compete with any other document in the queue.
+ return;
+ }
+ }
+ }
+
+ // TODO: should we add option to mean "ignore docs that
+ // don't have the group field" (instead of stuffing them
+ // under null group)?
+ final GROUP_VALUE_TYPE groupValue = getDocGroupValue(doc);
+
+ final CollectedSearchGroup<GROUP_VALUE_TYPE> group = groupMap.get(groupValue);
+
+ if (group == null) {
+
+ // First time we are seeing this group, or, we've seen
+ // it before but it fell out of the top N and is now
+ // coming back
+
+ if (groupMap.size() < topNGroups) {
+
+ // Still in startup transient: we have not
+ // seen enough unique groups to start pruning them;
+ // just keep collecting them
+
+ // Add a new CollectedSearchGroup:
+ CollectedSearchGroup<GROUP_VALUE_TYPE> sg = new CollectedSearchGroup<GROUP_VALUE_TYPE>();
+ sg.groupValue = copyDocGroupValue(groupValue, null);
+ sg.comparatorSlot = groupMap.size();
+ sg.topDoc = docBase + doc;
+ for (FieldComparator fc : comparators) {
+ fc.copy(sg.comparatorSlot, doc);
+ }
+ groupMap.put(sg.groupValue, sg);
+
+ if (groupMap.size() == topNGroups) {
+ // End of startup transient: we now have max
+ // number of groups; from here on we will drop
+ // bottom group when we insert new one:
+ buildSortedSet();
+ }
+
+ return;
+ }
+
+ // We already tested that the document is competitive, so replace
+ // the bottom group with this new group.
+
+ // java 6-only: final CollectedSearchGroup bottomGroup = orderedGroups.pollLast();
+ final CollectedSearchGroup<GROUP_VALUE_TYPE> bottomGroup = orderedGroups.last();
+ orderedGroups.remove(bottomGroup);
+ assert orderedGroups.size() == topNGroups -1;
+
+ groupMap.remove(bottomGroup.groupValue);
+
+ // reuse the removed CollectedSearchGroup
+ bottomGroup.groupValue = copyDocGroupValue(groupValue, bottomGroup.groupValue);
+ bottomGroup.topDoc = docBase + doc;
+
+ for (FieldComparator fc : comparators) {
+ fc.copy(bottomGroup.comparatorSlot, doc);
+ }
+
+ groupMap.put(bottomGroup.groupValue, bottomGroup);
+ orderedGroups.add(bottomGroup);
+ assert orderedGroups.size() == topNGroups;
+
+ final int lastComparatorSlot = orderedGroups.last().comparatorSlot;
+ for (FieldComparator fc : comparators) {
+ fc.setBottom(lastComparatorSlot);
+ }
+
+ return;
+ }
+
+ // Update existing group:
+ for (int compIDX = 0;; compIDX++) {
+ final FieldComparator fc = comparators[compIDX];
+ fc.copy(spareSlot, doc);
+
+ final int c = reversed[compIDX] * fc.compare(group.comparatorSlot, spareSlot);
+ if (c < 0) {
+ // Definitely not competitive.
+ return;
+ } else if (c > 0) {
+ // Definitely competitive; set remaining comparators:
+ for (int compIDX2=compIDX+1; compIDX2<comparators.length; compIDX2++) {
+ comparators[compIDX2].copy(spareSlot, doc);
+ }
+ break;
+ } else if (compIDX == compIDXEnd) {
+ // Here c=0. If we're at the last comparator, this doc is not
+ // competitive, since docs are visited in doc Id order, which means
+ // this doc cannot compete with any other document in the queue.
+ return;
+ }
+ }
+
+ // Remove before updating the group since lookup is done via comparators
+ // TODO: optimize this
+
+ final CollectedSearchGroup<GROUP_VALUE_TYPE> prevLast;
+ if (orderedGroups != null) {
+ prevLast = orderedGroups.last();
+ orderedGroups.remove(group);
+ assert orderedGroups.size() == topNGroups-1;
+ } else {
+ prevLast = null;
+ }
+
+ group.topDoc = docBase + doc;
+
+ // Swap slots
+ final int tmp = spareSlot;
+ spareSlot = group.comparatorSlot;
+ group.comparatorSlot = tmp;
+
+ // Re-add the changed group
+ if (orderedGroups != null) {
+ orderedGroups.add(group);
+ assert orderedGroups.size() == topNGroups;
+ final CollectedSearchGroup newLast = orderedGroups.last();
+ // If we changed the value of the last group, or changed which group was last, then update bottom:
+ if (group == newLast || prevLast != newLast) {
+ for (FieldComparator fc : comparators) {
+ fc.setBottom(newLast.comparatorSlot);
+ }
+ }
+ }
+ }
+
+ private void buildSortedSet() {
+ final Comparator<CollectedSearchGroup> comparator = new Comparator<CollectedSearchGroup>() {
+ public int compare(CollectedSearchGroup o1, CollectedSearchGroup o2) {
+ for (int compIDX = 0;; compIDX++) {
+ FieldComparator fc = comparators[compIDX];
+ final int c = reversed[compIDX] * fc.compare(o1.comparatorSlot, o2.comparatorSlot);
+ if (c != 0) {
+ return c;
+ } else if (compIDX == compIDXEnd) {
+ return o1.topDoc - o2.topDoc;
+ }
+ }
+ }
+ };
+
+ orderedGroups = new TreeSet<CollectedSearchGroup<GROUP_VALUE_TYPE>>(comparator);
+ orderedGroups.addAll(groupMap.values());
+ assert orderedGroups.size() > 0;
+
+ for (FieldComparator fc : comparators) {
+ fc.setBottom(orderedGroups.last().comparatorSlot);
+ }
+ }
+
+ @Override
+ public boolean acceptsDocsOutOfOrder() {
+ return false;
+ }
+
+ @Override
+ public void setNextReader(IndexReader reader, int docBase) throws IOException {
+ this.docBase = docBase;
+ for (int i=0; i<comparators.length; i++) {
+ comparators[i].setNextReader(reader, docBase);
+ }
+ }
+
+ /**
+ * Returns the group value for the specified doc.
+ *
+ * @param doc The specified doc
+ * @return the group value for the specified doc
+ */
+ protected abstract GROUP_VALUE_TYPE getDocGroupValue(int doc);
+
+ /**
+ * Returns a copy of the specified group value by creating a new instance and copying the value from the specified
+ * groupValue in the new instance. Or optionally the reuse argument can be used to copy the group value in.
+ *
+ * @param groupValue The group value to copy
+ * @param reuse Optionally a reuse instance to prevent a new instance creation
+ * @return a copy of the specified group value
+ */
+ protected abstract GROUP_VALUE_TYPE copyDocGroupValue(GROUP_VALUE_TYPE groupValue, GROUP_VALUE_TYPE reuse);
+}
+
+class CollectedSearchGroup<T> extends SearchGroup<T> {
+ int topDoc;
+ int comparatorSlot;
+}