1 package org.apache.lucene.index;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
import java.io.IOException;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
30 * Merges segments of approximately equal size, subject to
31 * an allowed number of segments per tier. This is similar
32 * to {@link LogByteSizeMergePolicy}, except this merge
33 * policy is able to merge non-adjacent segments, and
34 * separates how many segments are merged at once ({@link
35 * #setMaxMergeAtOnce}) from how many segments are allowed
36 * per tier ({@link #setSegmentsPerTier}). This merge
37 * policy also does not over-merge (ie, cascade merges).
39 * <p>For normal merging, this policy first computes a
40 * "budget" of how many segments are allowed to be in the
41 * index. If the index is over-budget, then the policy
42 * sorts segments by decreasing size (pro-rating by percent
43 * deletes), and then finds the least-cost merge. Merge
44 * cost is measured by a combination of the "skew" of the
45 * merge (size of largest seg divided by smallest seg),
46 * total merge size and pct deletes reclaimed,
47 * so that merges with lower skew, smaller size
48 * and those reclaiming more deletes, are
51 * <p>If a merge will produce a segment that's larger than
52 * {@link #setMaxMergedSegmentMB}, then the policy will
53 * merge fewer segments (down to 1 at once, if that one has
54 * deletions) to keep the segment size under budget.
56 * <p><b>NOTE</b>: this policy freely merges non-adjacent
57 * segments; if this is a problem, use {@link
60 * <p><b>NOTE</b>: This policy always merges by byte size
61 * of the segments, always pro-rates by percent deletes,
62 * and does not apply any maximum segment size during
63 * optimize (unlike {@link LogByteSizeMergePolicy}).
65 * @lucene.experimental
69 // - we could try to take into account whether a large
70 // merge is already running (under CMS) and then bias
71 // ourselves towards picking smaller merges if so (or,
72 // maybe CMS should do so)
public class TieredMergePolicy extends MergePolicy {
  // Max number of segments merged at once during "normal" (background) merging; see setMaxMergeAtOnce.
  private int maxMergeAtOnce = 10;
  // Approximate cap on the byte size of a merged segment produced by normal merging (default 5 GB).
  private long maxMergedSegmentBytes = 5*1024*1024*1024L;
  // Max number of segments merged at once for explicit merges (optimize / expungeDeletes); see setMaxMergeAtOnceExplicit.
  private int maxMergeAtOnceExplicit = 30;
  // Segments smaller than this are "rounded up" to this size for merge selection (default 2 MB); see setFloorSegmentMB.
  private long floorSegmentBytes = 2*1024*1024L;
  // Allowed number of segments per size tier before a merge is selected; see setSegmentsPerTier.
  private double segsPerTier = 10.0;
  // expungeDeletes only merges a segment whose percent-deleted exceeds this threshold (default 10%).
  private double expungeDeletesPctAllowed = 10.0;
  // Whether newly merged segments are written in compound file format.
  private boolean useCompoundFile = true;
  // If a merged segment would exceed this fraction of total index size, leave it non-compound; see setNoCFSRatio.
  private double noCFSRatio = 0.1;
  // Exponent controlling how strongly merges that reclaim deletes are favored; see setReclaimDeletesWeight.
  private double reclaimDeletesWeight = 2.0;
87 /** Maximum number of segments to be merged at a time
88 * during "normal" merging. For explicit merging (eg,
89 * optimize or expungeDeletes was called), see {@link
90 * #setMaxMergeAtOnceExplicit}. Default is 10. */
91 public TieredMergePolicy setMaxMergeAtOnce(int v) {
93 throw new IllegalArgumentException("maxMergeAtOnce must be > 1 (got " + v + ")");
99 /** @see #setMaxMergeAtOnce */
100 public int getMaxMergeAtOnce() {
101 return maxMergeAtOnce;
104 // TODO: should addIndexes do explicit merging, too? And,
105 // if user calls IW.maybeMerge "explicitly"
107 /** Maximum number of segments to be merged at a time,
108 * during optimize or expungeDeletes. Default is 30. */
109 public TieredMergePolicy setMaxMergeAtOnceExplicit(int v) {
111 throw new IllegalArgumentException("maxMergeAtOnceExplicit must be > 1 (got " + v + ")");
113 maxMergeAtOnceExplicit = v;
117 /** @see #setMaxMergeAtOnceExplicit */
118 public int getMaxMergeAtOnceExplicit() {
119 return maxMergeAtOnceExplicit;
122 /** Maximum sized segment to produce during
123 * normal merging. This setting is approximate: the
124 * estimate of the merged segment size is made by summing
125 * sizes of to-be-merged segments (compensating for
126 * percent deleted docs). Default is 5 GB. */
127 public TieredMergePolicy setMaxMergedSegmentMB(double v) {
128 maxMergedSegmentBytes = (long) (v*1024*1024);
132 /** @see #getMaxMergedSegmentMB */
133 public double getMaxMergedSegmentMB() {
134 return maxMergedSegmentBytes/1024/1024.;
137 /** Controls how aggressively merges that reclaim more
138 * deletions are favored. Higher values favor selecting
139 * merges that reclaim deletions. A value of 0.0 means
140 * deletions don't impact merge selection. */
141 public TieredMergePolicy setReclaimDeletesWeight(double v) {
143 throw new IllegalArgumentException("reclaimDeletesWeight must be >= 0.0 (got " + v + ")");
145 reclaimDeletesWeight = v;
149 /** See {@link #setReclaimDeletesWeight}. */
150 public double getReclaimDeletesWeight() {
151 return reclaimDeletesWeight;
154 /** Segments smaller than this are "rounded up" to this
155 * size, ie treated as equal (floor) size for merge
156 * selection. This is to prevent frequent flushing of
157 * tiny segments from allowing a long tail in the index.
158 * Default is 2 MB. */
159 public TieredMergePolicy setFloorSegmentMB(double v) {
161 throw new IllegalArgumentException("floorSegmentMB must be >= 0.0 (got " + v + ")");
163 floorSegmentBytes = (long) (v*1024*1024);
167 /** @see #setFloorSegmentMB */
168 public double getFloorSegmentMB() {
169 return floorSegmentBytes/1024*1024.;
172 /** When expungeDeletes is called, we only merge away a
173 * segment if its delete percentage is over this
174 * threshold. Default is 10%. */
175 public TieredMergePolicy setExpungeDeletesPctAllowed(double v) {
176 if (v < 0.0 || v > 100.0) {
177 throw new IllegalArgumentException("expungeDeletesPctAllowed must be between 0.0 and 100.0 inclusive (got " + v + ")");
179 expungeDeletesPctAllowed = v;
183 /** @see #setExpungeDeletesPctAllowed */
184 public double getExpungeDeletesPctAllowed() {
185 return expungeDeletesPctAllowed;
188 /** Sets the allowed number of segments per tier. Smaller
189 * values mean more merging but fewer segments.
191 * <p><b>NOTE</b>: this value should be >= the {@link
192 * #setMaxMergeAtOnce} otherwise you'll force too much
193 * merging to occur.</p>
195 * <p>Default is 10.0.</p> */
196 public TieredMergePolicy setSegmentsPerTier(double v) {
198 throw new IllegalArgumentException("segmentsPerTier must be >= 2.0 (got " + v + ")");
204 /** @see #setSegmentsPerTier */
205 public double getSegmentsPerTier() {
209 /** Sets whether compound file format should be used for
210 * newly flushed and newly merged segments. Default
212 public TieredMergePolicy setUseCompoundFile(boolean useCompoundFile) {
213 this.useCompoundFile = useCompoundFile;
217 /** @see #setUseCompoundFile */
218 public boolean getUseCompoundFile() {
219 return useCompoundFile;
222 /** If a merged segment will be more than this percentage
223 * of the total size of the index, leave the segment as
224 * non-compound file even if compound file is enabled.
225 * Set to 1.0 to always use CFS regardless of merge
226 * size. Default is 0.1. */
227 public TieredMergePolicy setNoCFSRatio(double noCFSRatio) {
228 if (noCFSRatio < 0.0 || noCFSRatio > 1.0) {
229 throw new IllegalArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + noCFSRatio);
231 this.noCFSRatio = noCFSRatio;
235 /** @see #setNoCFSRatio */
236 public double getNoCFSRatio() {
240 private class SegmentByteSizeDescending implements Comparator<SegmentInfo> {
241 public int compare(SegmentInfo o1, SegmentInfo o2) {
243 final long sz1 = size(o1);
244 final long sz2 = size(o2);
247 } else if (sz2 > sz1) {
250 return o1.name.compareTo(o2.name);
252 } catch (IOException ioe) {
253 throw new RuntimeException(ioe);
258 private final Comparator<SegmentInfo> segmentByteSizeDescending = new SegmentByteSizeDescending();
260 protected static abstract class MergeScore {
261 abstract double getScore();
262 abstract String getExplanation();
266 public MergeSpecification findMerges(SegmentInfos infos) throws IOException {
268 message("findMerges: " + infos.size() + " segments");
270 if (infos.size() == 0) {
273 final Collection<SegmentInfo> merging = writer.get().getMergingSegments();
274 final Collection<SegmentInfo> toBeMerged = new HashSet<SegmentInfo>();
276 final List<SegmentInfo> infosSorted = new ArrayList<SegmentInfo>(infos.asList());
277 Collections.sort(infosSorted, segmentByteSizeDescending);
279 // Compute total index bytes & print details about the index
280 long totIndexBytes = 0;
281 long minSegmentBytes = Long.MAX_VALUE;
282 for(SegmentInfo info : infosSorted) {
283 final long segBytes = size(info);
285 String extra = merging.contains(info) ? " [merging]" : "";
286 if (segBytes >= maxMergedSegmentBytes/2.0) {
287 extra += " [skip: too large]";
288 } else if (segBytes < floorSegmentBytes) {
289 extra += " [floored]";
291 message(" seg=" + writer.get().segString(info) + " size=" + String.format("%.3f", segBytes/1024/1024.) + " MB" + extra);
294 minSegmentBytes = Math.min(segBytes, minSegmentBytes);
295 // Accum total byte size
296 totIndexBytes += segBytes;
299 // If we have too-large segments, grace them out
300 // of the maxSegmentCount:
302 while (tooBigCount < infosSorted.size() && size(infosSorted.get(tooBigCount)) >= maxMergedSegmentBytes/2.0) {
303 totIndexBytes -= size(infosSorted.get(tooBigCount));
307 minSegmentBytes = floorSize(minSegmentBytes);
309 // Compute max allowed segs in the index
310 long levelSize = minSegmentBytes;
311 long bytesLeft = totIndexBytes;
312 double allowedSegCount = 0;
314 final double segCountLevel = bytesLeft / (double) levelSize;
315 if (segCountLevel < segsPerTier) {
316 allowedSegCount += Math.ceil(segCountLevel);
319 allowedSegCount += segsPerTier;
320 bytesLeft -= segsPerTier * levelSize;
321 levelSize *= maxMergeAtOnce;
323 int allowedSegCountInt = (int) allowedSegCount;
325 MergeSpecification spec = null;
327 // Cycle to possibly select more than one merge:
330 long mergingBytes = 0;
332 // Gather eligible segments for merging, ie segments
333 // not already being merged and not already picked (by
334 // prior iteration of this loop) for merging:
335 final List<SegmentInfo> eligible = new ArrayList<SegmentInfo>();
336 for(int idx = tooBigCount; idx<infosSorted.size(); idx++) {
337 final SegmentInfo info = infosSorted.get(idx);
338 if (merging.contains(info)) {
339 mergingBytes += info.sizeInBytes(true);
340 } else if (!toBeMerged.contains(info)) {
345 final boolean maxMergeIsRunning = mergingBytes >= maxMergedSegmentBytes;
347 message(" allowedSegmentCount=" + allowedSegCountInt + " vs count=" + infosSorted.size() + " (eligible count=" + eligible.size() + ") tooBigCount=" + tooBigCount);
349 if (eligible.size() == 0) {
353 if (eligible.size() >= allowedSegCountInt) {
355 // OK we are over budget -- find best merge!
356 MergeScore bestScore = null;
357 List<SegmentInfo> best = null;
358 boolean bestTooLarge = false;
359 long bestMergeBytes = 0;
361 // Consider all merge starts:
362 for(int startIdx = 0;startIdx <= eligible.size()-maxMergeAtOnce; startIdx++) {
364 long totAfterMergeBytes = 0;
366 final List<SegmentInfo> candidate = new ArrayList<SegmentInfo>();
367 boolean hitTooLarge = false;
368 for(int idx = startIdx;idx<eligible.size() && candidate.size() < maxMergeAtOnce;idx++) {
369 final SegmentInfo info = eligible.get(idx);
370 final long segBytes = size(info);
372 if (totAfterMergeBytes + segBytes > maxMergedSegmentBytes) {
374 // NOTE: we continue, so that we can try
375 // "packing" smaller segments into this merge
376 // to see if we can get closer to the max
377 // size; this in general is not perfect since
378 // this is really "bin packing" and we'd have
379 // to try different permutations.
383 totAfterMergeBytes += segBytes;
386 final MergeScore score = score(candidate, hitTooLarge, mergingBytes);
387 message(" maybe=" + writer.get().segString(candidate) + " score=" + score.getScore() + " " + score.getExplanation() + " tooLarge=" + hitTooLarge + " size=" + String.format("%.3f MB", totAfterMergeBytes/1024./1024.));
389 // If we are already running a max sized merge
390 // (maxMergeIsRunning), don't allow another max
391 // sized merge to kick off:
392 if ((bestScore == null || score.getScore() < bestScore.getScore()) && (!hitTooLarge || !maxMergeIsRunning)) {
395 bestTooLarge = hitTooLarge;
396 bestMergeBytes = totAfterMergeBytes;
402 spec = new MergeSpecification();
404 final OneMerge merge = new OneMerge(best);
406 for(SegmentInfo info : merge.segments) {
407 toBeMerged.add(info);
411 message(" add merge=" + writer.get().segString(merge.segments) + " size=" + String.format("%.3f MB", bestMergeBytes/1024./1024.) + " score=" + String.format("%.3f", bestScore.getScore()) + " " + bestScore.getExplanation() + (bestTooLarge ? " [max merge]" : ""));
422 /** Expert: scores one merge; subclasses can override. */
423 protected MergeScore score(List<SegmentInfo> candidate, boolean hitTooLarge, long mergingBytes) throws IOException {
424 long totBeforeMergeBytes = 0;
425 long totAfterMergeBytes = 0;
426 long totAfterMergeBytesFloored = 0;
427 for(SegmentInfo info : candidate) {
428 final long segBytes = size(info);
429 totAfterMergeBytes += segBytes;
430 totAfterMergeBytesFloored += floorSize(segBytes);
431 totBeforeMergeBytes += info.sizeInBytes(true);
434 // Measure "skew" of the merge, which can range
435 // from 1.0/numSegsBeingMerged (good) to 1.0
439 // Pretend the merge has perfect skew; skew doesn't
440 // matter in this case because this merge will not
441 // "cascade" and so it cannot lead to N^2 merge cost
443 skew = 1.0/maxMergeAtOnce;
445 skew = ((double) floorSize(size(candidate.get(0))))/totAfterMergeBytesFloored;
448 // Strongly favor merges with less skew (smaller
449 // mergeScore is better):
450 double mergeScore = skew;
452 // Gently favor smaller merges over bigger ones. We
453 // don't want to make this exponent too large else we
454 // can end up doing poor merges of small segments in
455 // order to avoid the large merges:
456 mergeScore *= Math.pow(totAfterMergeBytes, 0.05);
458 // Strongly favor merges that reclaim deletes:
459 final double nonDelRatio = ((double) totAfterMergeBytes)/totBeforeMergeBytes;
460 mergeScore *= Math.pow(nonDelRatio, reclaimDeletesWeight);
462 final double finalMergeScore = mergeScore;
464 return new MergeScore() {
467 public double getScore() {
468 return finalMergeScore;
472 public String getExplanation() {
473 return "skew=" + String.format("%.3f", skew) + " nonDelRatio=" + String.format("%.3f", nonDelRatio);
479 public MergeSpecification findMergesForOptimize(SegmentInfos infos, int maxSegmentCount, Map<SegmentInfo,Boolean> segmentsToOptimize) throws IOException {
481 message("findMergesForOptimize maxSegmentCount=" + maxSegmentCount + " infos=" + writer.get().segString(infos) + " segmentsToOptimize=" + segmentsToOptimize);
484 List<SegmentInfo> eligible = new ArrayList<SegmentInfo>();
485 boolean optimizeMergeRunning = false;
486 final Collection<SegmentInfo> merging = writer.get().getMergingSegments();
487 boolean segmentIsOriginal = false;
488 for(SegmentInfo info : infos) {
489 final Boolean isOriginal = segmentsToOptimize.get(info);
490 if (isOriginal != null) {
491 segmentIsOriginal = isOriginal;
492 if (!merging.contains(info)) {
495 optimizeMergeRunning = true;
500 if (eligible.size() == 0) {
504 if ((maxSegmentCount > 1 && eligible.size() <= maxSegmentCount) ||
505 (maxSegmentCount == 1 && eligible.size() == 1 && (!segmentIsOriginal || isOptimized(eligible.get(0))))) {
507 message("already optimized");
512 Collections.sort(eligible, segmentByteSizeDescending);
515 message("eligible=" + eligible);
516 message("optimizeMergeRunning=" + optimizeMergeRunning);
519 int end = eligible.size();
521 MergeSpecification spec = null;
523 // Do full merges, first, backwards:
524 while(end >= maxMergeAtOnceExplicit + maxSegmentCount - 1) {
526 spec = new MergeSpecification();
528 final OneMerge merge = new OneMerge(eligible.subList(end-maxMergeAtOnceExplicit, end));
530 message("add merge=" + writer.get().segString(merge.segments));
533 end -= maxMergeAtOnceExplicit;
536 if (spec == null && !optimizeMergeRunning) {
538 final int numToMerge = end - maxSegmentCount + 1;
539 final OneMerge merge = new OneMerge(eligible.subList(end-numToMerge, end));
541 message("add final merge=" + merge.segString(writer.get().getDirectory()));
543 spec = new MergeSpecification();
551 public MergeSpecification findMergesToExpungeDeletes(SegmentInfos infos)
552 throws CorruptIndexException, IOException {
554 message("findMergesToExpungeDeletes infos=" + writer.get().segString(infos) + " expungeDeletesPctAllowed=" + expungeDeletesPctAllowed);
556 final List<SegmentInfo> eligible = new ArrayList<SegmentInfo>();
557 final Collection<SegmentInfo> merging = writer.get().getMergingSegments();
558 for(SegmentInfo info : infos) {
559 double pctDeletes = 100.*((double) writer.get().numDeletedDocs(info))/info.docCount;
560 if (pctDeletes > expungeDeletesPctAllowed && !merging.contains(info)) {
565 if (eligible.size() == 0) {
569 Collections.sort(eligible, segmentByteSizeDescending);
572 message("eligible=" + eligible);
576 MergeSpecification spec = null;
578 while(start < eligible.size()) {
579 long totAfterMergeBytes = 0;
581 boolean done = false;
582 while(upto < start + maxMergeAtOnceExplicit) {
583 if (upto == eligible.size()) {
587 final SegmentInfo info = eligible.get(upto);
588 final long segBytes = size(info);
589 if (totAfterMergeBytes + segBytes > maxMergedSegmentBytes) {
590 // TODO: we could be smarter here, eg cherry
591 // picking smaller merges that'd sum up to just
592 // around the max size
595 totAfterMergeBytes += segBytes;
600 // Single segment is too big; grace it
606 spec = new MergeSpecification();
609 final OneMerge merge = new OneMerge(eligible.subList(start, upto));
611 message("add merge=" + writer.get().segString(merge.segments));
624 public boolean useCompoundFile(SegmentInfos infos, SegmentInfo mergedInfo) throws IOException {
627 if (!useCompoundFile) {
629 } else if (noCFSRatio == 1.0) {
633 for (SegmentInfo info : infos)
634 totalSize += size(info);
636 doCFS = size(mergedInfo) <= noCFSRatio * totalSize;
642 public void close() {
645 private boolean isOptimized(SegmentInfo info)
647 IndexWriter w = writer.get();
649 boolean hasDeletions = w.numDeletedDocs(info) > 0;
650 return !hasDeletions &&
651 !info.hasSeparateNorms() &&
652 info.dir == w.getDirectory() &&
653 (info.getUseCompoundFile() == useCompoundFile || noCFSRatio < 1.0);
656 // Segment size in bytes, pro-rated by % deleted
657 private long size(SegmentInfo info) throws IOException {
658 final long byteSize = info.sizeInBytes(true);
659 final int delCount = writer.get().numDeletedDocs(info);
660 final double delRatio = (info.docCount <= 0 ? 0.0f : ((double)delCount / (double)info.docCount));
661 assert delRatio <= 1.0;
662 return (long) (byteSize * (1.0-delRatio));
665 private long floorSize(long bytes) {
666 return Math.max(floorSegmentBytes, bytes);
669 private boolean verbose() {
670 IndexWriter w = writer.get();
671 return w != null && w.verbose();
674 private void message(String message) {
676 writer.get().message("TMP: " + message);
/** Returns a string listing every tunable parameter of this
 *  policy and its current value, for debugging/logging.
 *  NOTE(review): the method's closing lines fall outside this
 *  view; only comments are added here. */
public String toString() {
  StringBuilder sb = new StringBuilder("[" + getClass().getSimpleName() + ": ");
  sb.append("maxMergeAtOnce=").append(maxMergeAtOnce).append(", ");
  sb.append("maxMergeAtOnceExplicit=").append(maxMergeAtOnceExplicit).append(", ");
  // Byte fields are reported in MB to match the setter units:
  sb.append("maxMergedSegmentMB=").append(maxMergedSegmentBytes/1024/1024.).append(", ");
  sb.append("floorSegmentMB=").append(floorSegmentBytes/1024/1024.).append(", ");
  sb.append("expungeDeletesPctAllowed=").append(expungeDeletesPctAllowed).append(", ");
  sb.append("segmentsPerTier=").append(segsPerTier).append(", ");
  sb.append("useCompoundFile=").append(useCompoundFile).append(", ");
  sb.append("noCFSRatio=").append(noCFSRatio);
  return sb.toString();