1 package org.apache.lucene.index;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
/** <p>This class implements a {@link MergePolicy} that tries
 *  to merge segments into levels of exponentially
 *  increasing size, where each level has fewer segments than
 *  the value of the merge factor. Whenever extra segments
 *  (beyond the merge factor upper bound) are encountered,
 *  all segments within the level are merged. You can get or
 *  set the merge factor using {@link #getMergeFactor()} and
 *  {@link #setMergeFactor(int)} respectively.</p>
 *
 *  <p>This class is abstract and requires a subclass to
 *  define the {@link #size} method which specifies how a
 *  segment's size is determined. {@link LogDocMergePolicy}
 *  is one subclass that measures size by document count in
 *  the segment. {@link LogByteSizeMergePolicy} is another
 *  subclass that measures size as the total byte size of the
 *  file(s) for the segment.</p>
 */
44 public abstract class LogMergePolicy extends MergePolicy {
  /** Defines the allowed range of log(size) for each
   * level.  A level is computed by taking the max segment
   * log size, minus LEVEL_LOG_SPAN, and finding all
   * segments falling within that range. */
  public static final double LEVEL_LOG_SPAN = 0.75;

  /** Default merge factor, which is how many segments are
   *  merged at a time. */
  public static final int DEFAULT_MERGE_FACTOR = 10;

  /** Default maximum segment size. A segment of this size
   * or larger will never be merged. @see setMaxMergeDocs */
  public static final int DEFAULT_MAX_MERGE_DOCS = Integer.MAX_VALUE;

  /** Default noCFSRatio. If a merge's size is >= 10% of
   * the index, then we disable compound file for it.
   * @see #setNoCFSRatio */
  public static final double DEFAULT_NO_CFS_RATIO = 0.1;

  // How many segments are merged per level; see setMergeFactor(int).
  protected int mergeFactor = DEFAULT_MERGE_FACTOR;

  // Segments below minMergeSize are floored to the minimum level; segments
  // at or above maxMergeSize are skipped by findMerges (see "[skip: too large]").
  protected long minMergeSize;
  protected long maxMergeSize;
  // Although the core MPs set it explicitly, we must default in case someone
  // out there wrote his own LMP ...
  protected long maxMergeSizeForOptimize = Long.MAX_VALUE;
  protected int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;

  protected double noCFSRatio = DEFAULT_NO_CFS_RATIO;

  // When true, segment sizes are discounted by deleted docs (see sizeDocs/sizeBytes).
  protected boolean calibrateSizeByDeletes = true;

  protected boolean useCompoundFile = true;
80 public LogMergePolicy() {
84 protected boolean verbose() {
85 IndexWriter w = writer.get();
86 return w != null && w.verbose();
89 /** @see #setNoCFSRatio */
90 public double getNoCFSRatio() {
94 /** If a merged segment will be more than this percentage
95 * of the total size of the index, leave the segment as
96 * non-compound file even if compound file is enabled.
97 * Set to 1.0 to always use CFS regardless of merge
99 public void setNoCFSRatio(double noCFSRatio) {
100 if (noCFSRatio < 0.0 || noCFSRatio > 1.0) {
101 throw new IllegalArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + noCFSRatio);
103 this.noCFSRatio = noCFSRatio;
106 protected void message(String message) {
108 writer.get().message("LMP: " + message);
111 /** <p>Returns the number of segments that are merged at
112 * once and also controls the total number of segments
113 * allowed to accumulate in the index.</p> */
114 public int getMergeFactor() {
118 /** Determines how often segment indices are merged by
119 * addDocument(). With smaller values, less RAM is used
120 * while indexing, and searches on unoptimized indices are
121 * faster, but indexing speed is slower. With larger
122 * values, more RAM is used during indexing, and while
123 * searches on unoptimized indices are slower, indexing is
124 * faster. Thus larger values (> 10) are best for batch
125 * index creation, and smaller values (< 10) for indices
126 * that are interactively maintained. */
127 public void setMergeFactor(int mergeFactor) {
129 throw new IllegalArgumentException("mergeFactor cannot be less than 2");
130 this.mergeFactor = mergeFactor;
135 public boolean useCompoundFile(SegmentInfos infos, SegmentInfo mergedInfo) throws IOException {
138 if (!useCompoundFile) {
140 } else if (noCFSRatio == 1.0) {
144 for (SegmentInfo info : infos)
145 totalSize += size(info);
147 doCFS = size(mergedInfo) <= noCFSRatio * totalSize;
152 /** Sets whether compound file format should be used for
153 * newly flushed and newly merged segments. */
154 public void setUseCompoundFile(boolean useCompoundFile) {
155 this.useCompoundFile = useCompoundFile;
158 /** Returns true if newly flushed and newly merge segments
159 * are written in compound file format. @see
160 * #setUseCompoundFile */
161 public boolean getUseCompoundFile() {
162 return useCompoundFile;
165 /** Sets whether the segment size should be calibrated by
166 * the number of deletes when choosing segments for merge. */
167 public void setCalibrateSizeByDeletes(boolean calibrateSizeByDeletes) {
168 this.calibrateSizeByDeletes = calibrateSizeByDeletes;
171 /** Returns true if the segment size should be calibrated
172 * by the number of deletes when choosing segments for merge. */
173 public boolean getCalibrateSizeByDeletes() {
174 return calibrateSizeByDeletes;
178 public void close() {}
180 abstract protected long size(SegmentInfo info) throws IOException;
182 protected long sizeDocs(SegmentInfo info) throws IOException {
183 if (calibrateSizeByDeletes) {
184 int delCount = writer.get().numDeletedDocs(info);
185 assert delCount <= info.docCount;
186 return (info.docCount - (long)delCount);
188 return info.docCount;
192 protected long sizeBytes(SegmentInfo info) throws IOException {
193 long byteSize = info.sizeInBytes(true);
194 if (calibrateSizeByDeletes) {
195 int delCount = writer.get().numDeletedDocs(info);
196 double delRatio = (info.docCount <= 0 ? 0.0f : ((float)delCount / (float)info.docCount));
197 assert delRatio <= 1.0;
198 return (info.docCount <= 0 ? byteSize : (long)(byteSize * (1.0 - delRatio)));
204 protected boolean isOptimized(SegmentInfos infos, int maxNumSegments, Map<SegmentInfo,Boolean> segmentsToOptimize) throws IOException {
205 final int numSegments = infos.size();
206 int numToOptimize = 0;
207 SegmentInfo optimizeInfo = null;
208 boolean segmentIsOriginal = false;
209 for(int i=0;i<numSegments && numToOptimize <= maxNumSegments;i++) {
210 final SegmentInfo info = infos.info(i);
211 final Boolean isOriginal = segmentsToOptimize.get(info);
212 if (isOriginal != null) {
213 segmentIsOriginal = isOriginal;
219 return numToOptimize <= maxNumSegments &&
220 (numToOptimize != 1 || !segmentIsOriginal || isOptimized(optimizeInfo));
223 /** Returns true if this single info is optimized (has no
224 * pending norms or deletes, is in the same dir as the
225 * writer, and matches the current compound file setting */
226 protected boolean isOptimized(SegmentInfo info)
228 IndexWriter w = writer.get();
230 boolean hasDeletions = w.numDeletedDocs(info) > 0;
231 return !hasDeletions &&
232 !info.hasSeparateNorms() &&
233 info.dir == w.getDirectory() &&
234 (info.getUseCompoundFile() == useCompoundFile || noCFSRatio < 1.0);
  /**
   * Returns the merges necessary to optimize the index, taking the max merge
   * size or max merge docs into consideration. This method attempts to respect
   * the {@code maxNumSegments} parameter, however it might be, due to size
   * constraints, that more than that number of segments will remain in the
   * index. Also, this method does not guarantee that exactly {@code
   * maxNumSegments} will remain, but &lt;= that number.
   */
  private MergeSpecification findMergesForOptimizeSizeLimit(
      SegmentInfos infos, int maxNumSegments, int last) throws IOException {
    MergeSpecification spec = new MergeSpecification();
    final List<SegmentInfo> segments = infos.asList();

    // NOTE(review): this copy of the method appears truncated — the
    // right-to-left scan loop header, verbose() guards, `last = start`
    // updates and several closing braces seem to be missing.  Verify
    // against the canonical source before compiling.
    int start = last - 1;
    // Scan segments from newest (rightmost) backwards.
    SegmentInfo info = infos.info(start);
    if (size(info) > maxMergeSizeForOptimize || sizeDocs(info) > maxMergeDocs) {
      message("optimize: skip segment=" + info + ": size is > maxMergeSize (" + maxMergeSizeForOptimize + ") or sizeDocs is > maxMergeDocs (" + maxMergeDocs + ")");
      // need to skip that segment + add a merge for the 'right' segments,
      // unless there is only 1 which is optimized.
      if (last - start - 1 > 1 || (start != last - 1 && !isOptimized(infos.info(start + 1)))) {
        // there is more than 1 segment to the right of this one, or an unoptimized single segment.
        spec.add(new OneMerge(segments.subList(start + 1, last)));
    } else if (last - start == mergeFactor) {
      // mergeFactor eligible segments were found, add them as a merge.
      spec.add(new OneMerge(segments.subList(start, last)));

    // Add any left-over segments, unless there is just 1 already optimized.
    if (last > 0 && (++start + 1 < last || !isOptimized(infos.info(start)))) {
      spec.add(new OneMerge(segments.subList(start, last)));

    // Return null (no merges needed) when nothing was enrolled.
    return spec.merges.size() == 0 ? null : spec;
  /**
   * Returns the merges necessary to optimize the index. This method constraints
   * the returned merges only by the {@code maxNumSegments} parameter, and
   * guaranteed that exactly that number of segments will remain in the index.
   */
  private MergeSpecification findMergesForOptimizeMaxNumSegments(SegmentInfos infos, int maxNumSegments, int last) throws IOException {
    MergeSpecification spec = new MergeSpecification();
    final List<SegmentInfo> segments = infos.asList();

    // NOTE(review): this copy appears truncated — the `last -= mergeFactor`
    // style loop update and the declarations of the running `sumSize`,
    // `bestSize` and `bestStart` accumulators are missing, along with some
    // closing braces.  Verify against the canonical source before compiling.

    // First, enroll all "full" merges (size
    // mergeFactor) to potentially be run concurrently:
    while (last - maxNumSegments + 1 >= mergeFactor) {
      spec.add(new OneMerge(segments.subList(last - mergeFactor, last)));

    // Only if there are no full merges pending do we
    // add a final partial (< mergeFactor segments) merge:
    if (0 == spec.merges.size()) {
      if (maxNumSegments == 1) {

        // Since we must optimize down to 1 segment, the
        if (last > 1 || !isOptimized(infos.info(0))) {
          spec.add(new OneMerge(segments.subList(0, last)));
      } else if (last > maxNumSegments) {

        // Take care to pick a partial merge that is
        // least cost, but does not make the index too
        // lopsided.  If we always just picked the
        // partial tail then we could produce a highly
        // lopsided index over time:

        // We must merge this many segments to leave
        // maxNumSegments in the index (from when
        // optimize was first kicked off):
        final int finalMergeSize = last - maxNumSegments + 1;

        // Consider all possible starting points:
        for(int i=0;i<last-finalMergeSize+1;i++) {
          // Sum the sizes of the finalMergeSize segments starting at i.
          for(int j=0;j<finalMergeSize;j++)
            sumSize += size(infos.info(j+i));
          // Track the cheapest window that does not dwarf its left neighbor.
          if (i == 0 || (sumSize < 2*size(infos.info(i-1)) && sumSize < bestSize)) {

        spec.add(new OneMerge(segments.subList(bestStart, bestStart + finalMergeSize)));

    // Return null (no merges needed) when nothing was enrolled.
    return spec.merges.size() == 0 ? null : spec;
  /** Returns the merges necessary to optimize the index.
   * This merge policy defines "optimized" to mean only the
   * requested number of segments is left in the index, and
   * respects the {@link #maxMergeSizeForOptimize} setting.
   * By default, and assuming {@code maxNumSegments=1}, only
   * one segment will be left in the index, where that segment
   * has no deletions pending nor separate norms, and it is in
   * compound file format if the current useCompoundFile
   * setting is true.  This method returns multiple merges
   * (mergeFactor at a time) so the {@link MergeScheduler}
   * in use may make use of concurrency. */
  public MergeSpecification findMergesForOptimize(SegmentInfos infos,
      int maxNumSegments, Map<SegmentInfo,Boolean> segmentsToOptimize) throws IOException {

    // NOTE(review): this copy appears truncated — the backward-scan loop
    // around `infos.info(--last)` (with its `last++; break;` exit), the
    // verbose() guards and the if/else joining the two trailing returns
    // seem to be missing.  Verify against the canonical source.
    assert maxNumSegments > 0;

    message("findMergesForOptimize: maxNumSegs=" + maxNumSegments + " segsToOptimize="+ segmentsToOptimize);

    // If the segments are already optimized (e.g. there's only 1 segment), or
    // there are <maxNumSegements, all optimized, nothing to do.
    if (isOptimized(infos, maxNumSegments, segmentsToOptimize)) return null;

    // Find the newest (rightmost) segment that needs to
    // be optimized (other segments may have been flushed
    // since optimize started):
    int last = infos.size();
    final SegmentInfo info = infos.info(--last);
    if (segmentsToOptimize.get(info) != null) {

    if (last == 0) return null;

    // There is only one segment already, and it is optimized
    if (maxNumSegments == 1 && last == 1 && isOptimized(infos.info(0))) return null;

    // Check if there are any segments above the threshold
    boolean anyTooLarge = false;
    for (int i = 0; i < last; i++) {
      SegmentInfo info = infos.info(i);
      if (size(info) > maxMergeSizeForOptimize || sizeDocs(info) > maxMergeDocs) {

    // Size-capped path when any segment exceeds the optimize limits:
    return findMergesForOptimizeSizeLimit(infos, maxNumSegments, last);

    // Otherwise constrain purely by segment count:
    return findMergesForOptimizeMaxNumSegments(infos, maxNumSegments, last);
  /**
   * Finds merges necessary to expunge all deletes from the
   * index. We simply merge adjacent segments that have
   * deletes, up to mergeFactor at a time.
   */
  public MergeSpecification findMergesToExpungeDeletes(SegmentInfos segmentInfos)
    throws CorruptIndexException, IOException {

    // NOTE(review): this copy appears truncated — the verbose() guards,
    // the `if (delCount > 0)` branch around the per-segment handling,
    // several closing braces and the final `return spec;` seem to be
    // missing.  Verify against the canonical source before compiling.
    final List<SegmentInfo> segments = segmentInfos.asList();
    final int numSegments = segments.size();

    message("findMergesToExpungeDeletes: " + numSegments + " segments");

    MergeSpecification spec = new MergeSpecification();
    int firstSegmentWithDeletions = -1;
    IndexWriter w = writer.get();
    for(int i=0;i<numSegments;i++) {
      final SegmentInfo info = segmentInfos.info(i);
      int delCount = w.numDeletedDocs(info);
        message("  segment " + info.name + " has deletions");
        if (firstSegmentWithDeletions == -1)
          firstSegmentWithDeletions = i;
        else if (i - firstSegmentWithDeletions == mergeFactor) {
          // We've seen mergeFactor segments in a row with
          // deletions, so force a merge now:
          message("  add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
          spec.add(new OneMerge(segments.subList(firstSegmentWithDeletions, i)));
          firstSegmentWithDeletions = i;
      } else if (firstSegmentWithDeletions != -1) {
        // End of a sequence of segments with deletions, so,
        // merge those past segments even if it's fewer than
        // mergeFactor segments
        message("  add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
        spec.add(new OneMerge(segments.subList(firstSegmentWithDeletions, i)));
        firstSegmentWithDeletions = -1;

    // Flush a trailing run of deleted segments, if any.
    if (firstSegmentWithDeletions != -1) {
      message("  add merge " + firstSegmentWithDeletions + " to " + (numSegments-1) + " inclusive");
      spec.add(new OneMerge(segments.subList(firstSegmentWithDeletions, numSegments)));
451 private static class SegmentInfoAndLevel implements Comparable<SegmentInfoAndLevel> {
456 public SegmentInfoAndLevel(SegmentInfo info, float level, int index) {
462 // Sorts largest to smallest
463 public int compareTo(SegmentInfoAndLevel other) {
464 if (level < other.level)
466 else if (level > other.level)
  /** Checks if any merges are now necessary and returns a
   * {@link MergePolicy.MergeSpecification} if so.  A merge
   * is necessary when there are more than {@link
   * #setMergeFactor} segments at a given level.  When
   * multiple levels have too many segments, this method
   * will return multiple merges, allowing the {@link
   * MergeScheduler} to use concurrency. */
  public MergeSpecification findMerges(SegmentInfos infos) throws IOException {

    // NOTE(review): this copy appears truncated — the verbose() guards,
    // the tiny-segment flooring branch, the `int start = 0;` declaration,
    // the `levelBottom` declaration, the loop-advance statements
    // (`start = 1+upto;` etc.), the maxLevel/upto updates and the final
    // `return spec;` seem to be missing.  Verify against the canonical
    // source before compiling.
    final int numSegments = infos.size();
    message("findMerges: " + numSegments + " segments");

    // Compute levels, which is just log (base mergeFactor)
    // of the size of each segment
    final List<SegmentInfoAndLevel> levels = new ArrayList<SegmentInfoAndLevel>();
    final float norm = (float) Math.log(mergeFactor);

    final Collection<SegmentInfo> mergingSegments = writer.get().getMergingSegments();

    for(int i=0;i<numSegments;i++) {
      final SegmentInfo info = infos.info(i);
      long size = size(info);

      // Floor tiny segments
      final SegmentInfoAndLevel infoLevel = new SegmentInfoAndLevel(info, (float) Math.log(size)/norm, i);
      levels.add(infoLevel);

        final long segBytes = sizeBytes(info);
        String extra = mergingSegments.contains(info) ? " [merging]" : "";
        if (size >= maxMergeSize) {
          extra += " [skip: too large]";
        message("seg=" + writer.get().segString(info) + " level=" + infoLevel.level + " size=" + String.format("%.3f MB", segBytes/1024/1024.) + extra);

    // Lowest level any segment can occupy, derived from minMergeSize.
    final float levelFloor;
    if (minMergeSize <= 0)
      levelFloor = (float) 0.0;
      levelFloor = (float) (Math.log(minMergeSize)/norm);

    // Now, we quantize the log values into levels.  The
    // first level is any segment whose log size is within
    // LEVEL_LOG_SPAN of the max size, or, who has such as
    // segment "to the right".  Then, we find the max of all
    // other segments and use that to define the next level

    MergeSpecification spec = null;

    final int numMergeableSegments = levels.size();

    while(start < numMergeableSegments) {

      // Find max level of all segments not already
      float maxLevel = levels.get(start).level;
      for(int i=1+start;i<numMergeableSegments;i++) {
        final float level = levels.get(i).level;
        if (level > maxLevel)

      // Now search backwards for the rightmost segment that
      // falls into this level:
      if (maxLevel <= levelFloor)
        // All remaining segments fall into the min level
        levelBottom = (float) (maxLevel - LEVEL_LOG_SPAN);

        // Force a boundary at the level floor
        if (levelBottom < levelFloor && maxLevel >= levelFloor)
          levelBottom = levelFloor;

      int upto = numMergeableSegments-1;
      while(upto >= start) {
        if (levels.get(upto).level >= levelBottom) {

      message("  level " + levelBottom + " to " + maxLevel + ": " + (1+upto-start) + " segments");

      // Finally, record all merges that are viable at this level:
      int end = start + mergeFactor;
      while(end <= 1+upto) {
        boolean anyTooLarge = false;
        boolean anyMerging = false;
        for(int i=start;i<end;i++) {
          final SegmentInfo info = levels.get(i).info;
          anyTooLarge |= (size(info) >= maxMergeSize || sizeDocs(info) >= maxMergeDocs);
          if (mergingSegments.contains(info)) {

        } else if (!anyTooLarge) {
          // Window is mergeable: enroll it.
          spec = new MergeSpecification();
          final List<SegmentInfo> mergeInfos = new ArrayList<SegmentInfo>();
          for(int i=start;i<end;i++) {
            mergeInfos.add(levels.get(i).info);
            assert infos.contains(levels.get(i).info);

          message("  add merge=" + writer.get().segString(mergeInfos) + " start=" + start + " end=" + end);

          spec.add(new OneMerge(mergeInfos));
        } else if (verbose()) {
          message("  " + start + " to " + end + ": contains segment over maxMergeSize or maxMergeDocs; skipping");

        // Slide the window forward by mergeFactor.
        end = start + mergeFactor;
611 /** <p>Determines the largest segment (measured by
612 * document count) that may be merged with other segments.
613 * Small values (e.g., less than 10,000) are best for
614 * interactive indexing, as this limits the length of
615 * pauses while indexing to a few seconds. Larger values
616 * are best for batched indexing and speedier
619 * <p>The default value is {@link Integer#MAX_VALUE}.</p>
621 * <p>The default merge policy ({@link
622 * LogByteSizeMergePolicy}) also allows you to set this
623 * limit by net size (in MB) of the segment, using {@link
624 * LogByteSizeMergePolicy#setMaxMergeMB}.</p>
626 public void setMaxMergeDocs(int maxMergeDocs) {
627 this.maxMergeDocs = maxMergeDocs;
630 /** Returns the largest segment (measured by document
631 * count) that may be merged with other segments.
632 * @see #setMaxMergeDocs */
633 public int getMaxMergeDocs() {
638 public String toString() {
639 StringBuilder sb = new StringBuilder("[" + getClass().getSimpleName() + ": ");
640 sb.append("minMergeSize=").append(minMergeSize).append(", ");
641 sb.append("mergeFactor=").append(mergeFactor).append(", ");
642 sb.append("maxMergeSize=").append(maxMergeSize).append(", ");
643 sb.append("maxMergeSizeForOptimize=").append(maxMergeSizeForOptimize).append(", ");
644 sb.append("calibrateSizeByDeletes=").append(calibrateSizeByDeletes).append(", ");
645 sb.append("maxMergeDocs=").append(maxMergeDocs).append(", ");
646 sb.append("useCompoundFile=").append(useCompoundFile).append(", ");
647 sb.append("noCFSRatio=").append(noCFSRatio);
649 return sb.toString();