X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/AbstractFirstPassGroupingCollector.java diff --git a/lucene-java-3.5.0/lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/AbstractFirstPassGroupingCollector.java b/lucene-java-3.5.0/lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/AbstractFirstPassGroupingCollector.java new file mode 100644 index 0000000..fb4b660 --- /dev/null +++ b/lucene-java-3.5.0/lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/AbstractFirstPassGroupingCollector.java @@ -0,0 +1,358 @@ +package org.apache.lucene.search.grouping; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.*; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.*; + +/** FirstPassGroupingCollector is the first of two passes necessary + * to collect grouped hits. This pass gathers the top N sorted + * groups. Concrete subclasses define what a group is and how it + * is internally collected. + * + *

See {@link org.apache.lucene.search.grouping} for more + * details including a full code example.

+ * + * @lucene.experimental + */ +abstract public class AbstractFirstPassGroupingCollector extends Collector { + + private final Sort groupSort; + private final FieldComparator[] comparators; + private final int[] reversed; + private final int topNGroups; + private final HashMap> groupMap; + private final int compIDXEnd; + + // Set once we reach topNGroups unique groups: + private TreeSet> orderedGroups; + private int docBase; + private int spareSlot; + + /** + * Create the first pass collector. + * + * @param groupSort The {@link Sort} used to sort the + * groups. The top sorted document within each group + * according to groupSort, determines how that group + * sorts against other groups. This must be non-null, + * ie, if you want to groupSort by relevance use + * Sort.RELEVANCE. + * @param topNGroups How many top groups to keep. + * @throws IOException If I/O related errors occur + */ + public AbstractFirstPassGroupingCollector(Sort groupSort, int topNGroups) throws IOException { + if (topNGroups < 1) { + throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")"); + } + + // TODO: allow null groupSort to mean "by relevance", + // and specialize it? + this.groupSort = groupSort; + + this.topNGroups = topNGroups; + + final SortField[] sortFields = groupSort.getSort(); + comparators = new FieldComparator[sortFields.length]; + compIDXEnd = comparators.length - 1; + reversed = new int[sortFields.length]; + for (int i = 0; i < sortFields.length; i++) { + final SortField sortField = sortFields[i]; + + // use topNGroups + 1 so we have a spare slot to use for comparing (tracked by this.spareSlot): + comparators[i] = sortField.getComparator(topNGroups + 1, i); + reversed[i] = sortField.getReverse() ? -1 : 1; + } + + spareSlot = topNGroups; + groupMap = new HashMap>(topNGroups); + } + + /** + * Returns top groups, starting from offset. This may + * return null, if no groups were collected, or if the + * number of unique groups collected is <= offset. + * + * @param groupOffset The offset in the collected groups + * @param fillFields Whether to fill to {@link SearchGroup#sortValues} + * @return top groups, starting from offset + */ + public Collection> getTopGroups(int groupOffset, boolean fillFields) { + + //System.out.println("FP.getTopGroups groupOffset=" + groupOffset + " fillFields=" + fillFields + " groupMap.size()=" + groupMap.size()); + + if (groupOffset < 0) { + throw new IllegalArgumentException("groupOffset must be >= 0 (got " + groupOffset + ")"); + } + + if (groupMap.size() <= groupOffset) { + return null; + } + + if (orderedGroups == null) { + buildSortedSet(); + } + + final Collection> result = new ArrayList>(); + int upto = 0; + final int sortFieldCount = groupSort.getSort().length; + for(CollectedSearchGroup group : orderedGroups) { + if (upto++ < groupOffset) { + continue; + } + //System.out.println(" group=" + (group.groupValue == null ? "null" : group.groupValue.utf8ToString())); + SearchGroup searchGroup = new SearchGroup(); + searchGroup.groupValue = group.groupValue; + if (fillFields) { + searchGroup.sortValues = new Object[sortFieldCount]; + for(int sortFieldIDX=0;sortFieldIDX group = groupMap.get(groupValue); + + if (group == null) { + + // First time we are seeing this group, or, we've seen + // it before but it fell out of the top N and is now + // coming back + + if (groupMap.size() < topNGroups) { + + // Still in startup transient: we have not + // seen enough unique groups to start pruning them; + // just keep collecting them + + // Add a new CollectedSearchGroup: + CollectedSearchGroup sg = new CollectedSearchGroup(); + sg.groupValue = copyDocGroupValue(groupValue, null); + sg.comparatorSlot = groupMap.size(); + sg.topDoc = docBase + doc; + for (FieldComparator fc : comparators) { + fc.copy(sg.comparatorSlot, doc); + } + groupMap.put(sg.groupValue, sg); + + if (groupMap.size() == topNGroups) { + // End of startup transient: we now have max + // number of groups; from here on we will drop + // bottom group when we insert new one: + buildSortedSet(); + } + + return; + } + + // We already tested that the document is competitive, so replace + // the bottom group with this new group. + + // java 6-only: final CollectedSearchGroup bottomGroup = orderedGroups.pollLast(); + final CollectedSearchGroup bottomGroup = orderedGroups.last(); + orderedGroups.remove(bottomGroup); + assert orderedGroups.size() == topNGroups -1; + + groupMap.remove(bottomGroup.groupValue); + + // reuse the removed CollectedSearchGroup + bottomGroup.groupValue = copyDocGroupValue(groupValue, bottomGroup.groupValue); + bottomGroup.topDoc = docBase + doc; + + for (FieldComparator fc : comparators) { + fc.copy(bottomGroup.comparatorSlot, doc); + } + + groupMap.put(bottomGroup.groupValue, bottomGroup); + orderedGroups.add(bottomGroup); + assert orderedGroups.size() == topNGroups; + + final int lastComparatorSlot = orderedGroups.last().comparatorSlot; + for (FieldComparator fc : comparators) { + fc.setBottom(lastComparatorSlot); + } + + return; + } + + // Update existing group: + for (int compIDX = 0;; compIDX++) { + final FieldComparator fc = comparators[compIDX]; + fc.copy(spareSlot, doc); + + final int c = reversed[compIDX] * fc.compare(group.comparatorSlot, spareSlot); + if (c < 0) { + // Definitely not competitive. + return; + } else if (c > 0) { + // Definitely competitive; set remaining comparators: + for (int compIDX2=compIDX+1; compIDX2 prevLast; + if (orderedGroups != null) { + prevLast = orderedGroups.last(); + orderedGroups.remove(group); + assert orderedGroups.size() == topNGroups-1; + } else { + prevLast = null; + } + + group.topDoc = docBase + doc; + + // Swap slots + final int tmp = spareSlot; + spareSlot = group.comparatorSlot; + group.comparatorSlot = tmp; + + // Re-add the changed group + if (orderedGroups != null) { + orderedGroups.add(group); + assert orderedGroups.size() == topNGroups; + final CollectedSearchGroup newLast = orderedGroups.last(); + // If we changed the value of the last group, or changed which group was last, then update bottom: + if (group == newLast || prevLast != newLast) { + for (FieldComparator fc : comparators) { + fc.setBottom(newLast.comparatorSlot); + } + } + } + } + + private void buildSortedSet() { + final Comparator comparator = new Comparator() { + public int compare(CollectedSearchGroup o1, CollectedSearchGroup o2) { + for (int compIDX = 0;; compIDX++) { + FieldComparator fc = comparators[compIDX]; + final int c = reversed[compIDX] * fc.compare(o1.comparatorSlot, o2.comparatorSlot); + if (c != 0) { + return c; + } else if (compIDX == compIDXEnd) { + return o1.topDoc - o2.topDoc; + } + } + } + }; + + orderedGroups = new TreeSet>(comparator); + orderedGroups.addAll(groupMap.values()); + assert orderedGroups.size() > 0; + + for (FieldComparator fc : comparators) { + fc.setBottom(orderedGroups.last().comparatorSlot); + } + } + + @Override + public boolean acceptsDocsOutOfOrder() { + return false; + } + + @Override + public void setNextReader(IndexReader reader, int docBase) throws IOException { + this.docBase = docBase; + for (int i=0; i extends SearchGroup { + int topDoc; + int comparatorSlot; +}