lucene-java-3.5.0/lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/AbstractFirstPassGroupingCollector.java

   1 package org.apache.lucene.search.grouping;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.util.*;
  22
  23 import org.apache.lucene.index.IndexReader;
  24 import org.apache.lucene.search.*;
  25
  26 /** FirstPassGroupingCollector is the first of two passes necessary
  27  *  to collect grouped hits.  This pass gathers the top N sorted
  28  *  groups. Concrete subclasses define what a group is and how it
  29  *  is internally collected.
  30  *
  31  *  <p>See {@link org.apache.lucene.search.grouping} for more
  32  *  details including a full code example.</p>
  33  *
  34  * @lucene.experimental
  35  */
  36 abstract public class AbstractFirstPassGroupingCollector<GROUP_VALUE_TYPE> extends Collector {
  37
  38   private final Sort groupSort;
  39   private final FieldComparator[] comparators;
  40   private final int[] reversed;
  41   private final int topNGroups;
  42   private final HashMap<GROUP_VALUE_TYPE, CollectedSearchGroup<GROUP_VALUE_TYPE>> groupMap;
  43   private final int compIDXEnd;
  44
  45   // Set once we reach topNGroups unique groups:
  46   private TreeSet<CollectedSearchGroup<GROUP_VALUE_TYPE>> orderedGroups;
  47   private int docBase;
  48   private int spareSlot;
  49
  50   /**
  51    * Create the first pass collector.
  52    *
  53    *  @param groupSort The {@link Sort} used to sort the
  54    *    groups.  The top sorted document within each group
  55    *    according to groupSort, determines how that group
  56    *    sorts against other groups.  This must be non-null,
  57    *    ie, if you want to groupSort by relevance use
  58    *    Sort.RELEVANCE.
  59    *  @param topNGroups How many top groups to keep.
  60    *  @throws IOException If I/O related errors occur
  61    */
  62   public AbstractFirstPassGroupingCollector(Sort groupSort, int topNGroups) throws IOException {
  63     if (topNGroups < 1) {
  64       throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")");
  65     }
  66
  67     // TODO: allow null groupSort to mean "by relevance",
  68     // and specialize it?
  69     this.groupSort = groupSort;
  70
  71     this.topNGroups = topNGroups;
  72
  73     final SortField[] sortFields = groupSort.getSort();
  74     comparators = new FieldComparator[sortFields.length];
  75     compIDXEnd = comparators.length - 1;
  76     reversed = new int[sortFields.length];
  77     for (int i = 0; i < sortFields.length; i++) {
  78       final SortField sortField = sortFields[i];
  79
  80       // use topNGroups + 1 so we have a spare slot to use for comparing (tracked by this.spareSlot):
  81       comparators[i] = sortField.getComparator(topNGroups + 1, i);
  82       reversed[i] = sortField.getReverse() ? -1 : 1;
  83     }
  84
  85     spareSlot = topNGroups;
  86     groupMap = new HashMap<GROUP_VALUE_TYPE, CollectedSearchGroup<GROUP_VALUE_TYPE>>(topNGroups);
  87   }
  88
  89   /**
  90    * Returns top groups, starting from offset.  This may
  91    * return null, if no groups were collected, or if the
  92    * number of unique groups collected is <= offset.
  93    *
  94    * @param groupOffset The offset in the collected groups
  95    * @param fillFields Whether to fill to {@link SearchGroup#sortValues}
  96    * @return top groups, starting from offset
  97    */
  98   public Collection<SearchGroup<GROUP_VALUE_TYPE>> getTopGroups(int groupOffset, boolean fillFields) {
  99
 100     //System.out.println("FP.getTopGroups groupOffset=" + groupOffset + " fillFields=" + fillFields + " groupMap.size()=" + groupMap.size());
 101
 102     if (groupOffset < 0) {
 103       throw new IllegalArgumentException("groupOffset must be >= 0 (got " + groupOffset + ")");
 104     }
 105
 106     if (groupMap.size() <= groupOffset) {
 107       return null;
 108     }
 109
 110     if (orderedGroups == null) {
 111       buildSortedSet();
 112     }
 113
 114     final Collection<SearchGroup<GROUP_VALUE_TYPE>> result = new ArrayList<SearchGroup<GROUP_VALUE_TYPE>>();
 115     int upto = 0;
 116     final int sortFieldCount = groupSort.getSort().length;
 117     for(CollectedSearchGroup<GROUP_VALUE_TYPE> group : orderedGroups) {
 118       if (upto++ < groupOffset) {
 119         continue;
 120       }
 121       //System.out.println("  group=" + (group.groupValue == null ? "null" : group.groupValue.utf8ToString()));
 122       SearchGroup<GROUP_VALUE_TYPE> searchGroup = new SearchGroup<GROUP_VALUE_TYPE>();
 123       searchGroup.groupValue = group.groupValue;
 124       if (fillFields) {
 125         searchGroup.sortValues = new Object[sortFieldCount];
 126         for(int sortFieldIDX=0;sortFieldIDX<sortFieldCount;sortFieldIDX++) {
 127           searchGroup.sortValues[sortFieldIDX] = comparators[sortFieldIDX].value(group.comparatorSlot);
 128         }
 129       }
 130       result.add(searchGroup);
 131     }
 132     //System.out.println("  return " + result.size() + " groups");
 133     return result;
 134   }
 135
 136   @Override
 137   public void setScorer(Scorer scorer) throws IOException {
 138     for (FieldComparator comparator : comparators) {
 139       comparator.setScorer(scorer);
 140     }
 141   }
 142
 143   @Override
 144   public void collect(int doc) throws IOException {
 145     //System.out.println("FP.collect doc=" + doc);
 146
 147     // If orderedGroups != null we already have collected N groups and
 148     // can short circuit by comparing this document to the bottom group,
 149     // without having to find what group this document belongs to.
 150
 151     // Even if this document belongs to a group in the top N, we'll know that
 152     // we don't have to update that group.
 153
 154     // Downside: if the number of unique groups is very low, this is
 155     // wasted effort as we will most likely be updating an existing group.
 156     if (orderedGroups != null) {
 157       for (int compIDX = 0;; compIDX++) {
 158         final int c = reversed[compIDX] * comparators[compIDX].compareBottom(doc);
 159         if (c < 0) {
 160           // Definitely not competitive. So don't even bother to continue
 161           return;
 162         } else if (c > 0) {
 163           // Definitely competitive.
 164           break;
 165         } else if (compIDX == compIDXEnd) {
 166           // Here c=0. If we're at the last comparator, this doc is not
 167           // competitive, since docs are visited in doc Id order, which means
 168           // this doc cannot compete with any other document in the queue.
 169           return;
 170         }
 171       }
 172     }
 173
 174     // TODO: should we add option to mean "ignore docs that
 175     // don't have the group field" (instead of stuffing them
 176     // under null group)?
 177     final GROUP_VALUE_TYPE groupValue = getDocGroupValue(doc);
 178
 179     final CollectedSearchGroup<GROUP_VALUE_TYPE> group = groupMap.get(groupValue);
 180
 181     if (group == null) {
 182
 183       // First time we are seeing this group, or, we've seen
 184       // it before but it fell out of the top N and is now
 185       // coming back
 186
 187       if (groupMap.size() < topNGroups) {
 188
 189         // Still in startup transient: we have not
 190         // seen enough unique groups to start pruning them;
 191         // just keep collecting them
 192
 193         // Add a new CollectedSearchGroup:
 194         CollectedSearchGroup<GROUP_VALUE_TYPE> sg = new CollectedSearchGroup<GROUP_VALUE_TYPE>();
 195         sg.groupValue = copyDocGroupValue(groupValue, null);
 196         sg.comparatorSlot = groupMap.size();
 197         sg.topDoc = docBase + doc;
 198         for (FieldComparator fc : comparators) {
 199           fc.copy(sg.comparatorSlot, doc);
 200         }
 201         groupMap.put(sg.groupValue, sg);
 202
 203         if (groupMap.size() == topNGroups) {
 204           // End of startup transient: we now have max
 205           // number of groups; from here on we will drop
 206           // bottom group when we insert new one:
 207           buildSortedSet();
 208         }
 209
 210         return;
 211       }
 212
 213       // We already tested that the document is competitive, so replace
 214       // the bottom group with this new group.
 215
 216       // java 6-only: final CollectedSearchGroup bottomGroup = orderedGroups.pollLast();
 217       final CollectedSearchGroup<GROUP_VALUE_TYPE> bottomGroup = orderedGroups.last();
 218       orderedGroups.remove(bottomGroup);
 219       assert orderedGroups.size() == topNGroups -1;
 220
 221       groupMap.remove(bottomGroup.groupValue);
 222
 223       // reuse the removed CollectedSearchGroup
 224       bottomGroup.groupValue = copyDocGroupValue(groupValue, bottomGroup.groupValue);
 225       bottomGroup.topDoc = docBase + doc;
 226
 227       for (FieldComparator fc : comparators) {
 228         fc.copy(bottomGroup.comparatorSlot, doc);
 229       }
 230
 231       groupMap.put(bottomGroup.groupValue, bottomGroup);
 232       orderedGroups.add(bottomGroup);
 233       assert orderedGroups.size() == topNGroups;
 234
 235       final int lastComparatorSlot = orderedGroups.last().comparatorSlot;
 236       for (FieldComparator fc : comparators) {
 237         fc.setBottom(lastComparatorSlot);
 238       }
 239
 240       return;
 241     }
 242
 243     // Update existing group:
 244     for (int compIDX = 0;; compIDX++) {
 245       final FieldComparator fc = comparators[compIDX];
 246       fc.copy(spareSlot, doc);
 247
 248       final int c = reversed[compIDX] * fc.compare(group.comparatorSlot, spareSlot);
 249       if (c < 0) {
 250         // Definitely not competitive.
 251         return;
 252       } else if (c > 0) {
 253         // Definitely competitive; set remaining comparators:
 254         for (int compIDX2=compIDX+1; compIDX2<comparators.length; compIDX2++) {
 255           comparators[compIDX2].copy(spareSlot, doc);
 256         }
 257         break;
 258       } else if (compIDX == compIDXEnd) {
 259         // Here c=0. If we're at the last comparator, this doc is not
 260         // competitive, since docs are visited in doc Id order, which means
 261         // this doc cannot compete with any other document in the queue.
 262         return;
 263       }
 264     }
 265
 266     // Remove before updating the group since lookup is done via comparators
 267     // TODO: optimize this
 268
 269     final CollectedSearchGroup<GROUP_VALUE_TYPE> prevLast;
 270     if (orderedGroups != null) {
 271       prevLast = orderedGroups.last();
 272       orderedGroups.remove(group);
 273       assert orderedGroups.size() == topNGroups-1;
 274     } else {
 275       prevLast = null;
 276     }
 277
 278     group.topDoc = docBase + doc;
 279
 280     // Swap slots
 281     final int tmp = spareSlot;
 282     spareSlot = group.comparatorSlot;
 283     group.comparatorSlot = tmp;
 284
 285     // Re-add the changed group
 286     if (orderedGroups != null) {
 287       orderedGroups.add(group);
 288       assert orderedGroups.size() == topNGroups;
 289       final CollectedSearchGroup newLast = orderedGroups.last();
 290       // If we changed the value of the last group, or changed which group was last, then update bottom:
 291       if (group == newLast || prevLast != newLast) {
 292         for (FieldComparator fc : comparators) {
 293           fc.setBottom(newLast.comparatorSlot);
 294         }
 295       }
 296     }
 297   }
 298
 299   private void buildSortedSet() {
 300     final Comparator<CollectedSearchGroup> comparator = new Comparator<CollectedSearchGroup>() {
 301       public int compare(CollectedSearchGroup o1, CollectedSearchGroup o2) {
 302         for (int compIDX = 0;; compIDX++) {
 303           FieldComparator fc = comparators[compIDX];
 304           final int c = reversed[compIDX] * fc.compare(o1.comparatorSlot, o2.comparatorSlot);
 305           if (c != 0) {
 306             return c;
 307           } else if (compIDX == compIDXEnd) {
 308             return o1.topDoc - o2.topDoc;
 309           }
 310         }
 311       }
 312     };
 313
 314     orderedGroups = new TreeSet<CollectedSearchGroup<GROUP_VALUE_TYPE>>(comparator);
 315     orderedGroups.addAll(groupMap.values());
 316     assert orderedGroups.size() > 0;
 317
 318     for (FieldComparator fc : comparators) {
 319       fc.setBottom(orderedGroups.last().comparatorSlot);
 320     }
 321   }
 322
 323   @Override
 324   public boolean acceptsDocsOutOfOrder() {
 325     return false;
 326   }
 327
 328   @Override
 329   public void setNextReader(IndexReader reader, int docBase) throws IOException {
 330     this.docBase = docBase;
 331     for (int i=0; i<comparators.length; i++) {
 332       comparators[i].setNextReader(reader, docBase);
 333     }
 334   }
 335
 336   /**
 337    * Returns the group value for the specified doc.
 338    *
 339    * @param doc The specified doc
 340    * @return the group value for the specified doc
 341    */
 342   protected abstract GROUP_VALUE_TYPE getDocGroupValue(int doc);
 343
 344   /**
 345    * Returns a copy of the specified group value by creating a new instance and copying the value from the specified
 346    * groupValue in the new instance. Or optionally the reuse argument can be used to copy the group value in.
 347    *
 348    * @param groupValue The group value to copy
 349    * @param reuse Optionally a reuse instance to prevent a new instance creation
 350    * @return a copy of the specified group value
 351    */
 352   protected abstract GROUP_VALUE_TYPE copyDocGroupValue(GROUP_VALUE_TYPE groupValue, GROUP_VALUE_TYPE reuse);
 353 }
 354
 355 class CollectedSearchGroup<T> extends SearchGroup<T> {
 356   int topDoc;
 357   int comparatorSlot;
 358 }