1 package org.apache.lucene.index;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;

import java.util.Collections;
import java.util.Map;
/**
 * Merge policy that tries to balance not doing large
 * segment merges with not accumulating too many segments in
 * the index, to provide for better performance in near
 * real-time setting.
 *
 * <p>This is based on code from zoie, described in more detail
 * at http://code.google.com/p/zoie/wiki/ZoieMergePolicy.</p>
 */
34 public class BalancedSegmentMergePolicy extends LogByteSizeMergePolicy {
  /** Default number of "large" head segments kept out of routine small-segment merging. */
  public static final int DEFAULT_NUM_LARGE_SEGMENTS = 10;

  // When true, merge selection additionally schedules a singleton merge of the
  // segment with the most deletions, to reclaim deleted-doc space.
  private boolean _partialExpunge = false;
  private int _numLargeSegments = DEFAULT_NUM_LARGE_SEGMENTS;
  private int _maxSmallSegments = 2 * LogMergePolicy.DEFAULT_MERGE_FACTOR;
  // Invariant (maintained by the setters): total segment budget before we force
  // small-segment consolidation in findMerges.
  private int _maxSegments = _numLargeSegments + _maxSmallSegments;
43 public BalancedSegmentMergePolicy() {
46 public void setMergePolicyParams(MergePolicyParams params) {
48 setPartialExpunge(params._doPartialExpunge);
49 setNumLargeSegments(params._numLargeSegments);
50 setMaxSmallSegments(params._maxSmallSegments);
51 setPartialExpunge(params._doPartialExpunge);
52 setMergeFactor(params._mergeFactor);
53 setUseCompoundFile(params._useCompoundFile);
54 setMaxMergeDocs(params._maxMergeDocs);
59 protected long size(SegmentInfo info) throws IOException {
60 long byteSize = info.sizeInBytes(true);
61 float delRatio = (info.docCount <= 0 ? 0.0f : ((float)info.getDelCount() / (float)info.docCount));
62 return (info.docCount <= 0 ? byteSize : (long)((1.0f - delRatio) * byteSize));
65 public void setPartialExpunge(boolean doPartialExpunge) {
66 _partialExpunge = doPartialExpunge;
69 public boolean getPartialExpunge() {
70 return _partialExpunge;
73 public void setNumLargeSegments(int numLargeSegments) {
74 if (numLargeSegments < 2) {
75 throw new IllegalArgumentException("numLargeSegments cannot be less than 2");
78 _numLargeSegments = numLargeSegments;
79 _maxSegments = _numLargeSegments + 2 * getMergeFactor();
82 public int getNumLargeSegments() {
83 return _numLargeSegments;
86 public void setMaxSmallSegments(int maxSmallSegments) {
87 if (maxSmallSegments < getMergeFactor()) {
88 throw new IllegalArgumentException("maxSmallSegments cannot be less than mergeFactor");
90 _maxSmallSegments = maxSmallSegments;
91 _maxSegments = _numLargeSegments + _maxSmallSegments;
94 public int getMaxSmallSegments() {
95 return _maxSmallSegments;
99 public void setMergeFactor(int mergeFactor) {
100 super.setMergeFactor(mergeFactor);
101 if (_maxSmallSegments < getMergeFactor()) {
102 _maxSmallSegments = getMergeFactor();
103 _maxSegments = _numLargeSegments + _maxSmallSegments;
108 public MergeSpecification findForcedMerges(SegmentInfos infos, int maxNumSegments, Map<SegmentInfo,Boolean> segmentsToMerge) throws IOException {
110 assert maxNumSegments > 0;
112 MergeSpecification spec = null;
114 if (!isMerged(infos, maxNumSegments, segmentsToMerge)) {
116 // Find the newest (rightmost) segment that needs to
117 // be merged (other segments may have been flushed
118 // since the merge started):
119 int last = infos.size();
122 final SegmentInfo info = infos.info(--last);
123 if (segmentsToMerge.containsKey(info)) {
131 if (maxNumSegments == 1) {
133 // Since we must merge down to 1 segment, the
135 if (last > 1 || !isMerged(infos.info(0))) {
137 spec = new MergeSpecification();
138 spec.add(new OneMerge(infos.asList().subList(0, last)));
140 } else if (last > maxNumSegments) {
142 // find most balanced merges
143 spec = findBalancedMerges(infos, last, maxNumSegments, _partialExpunge);
150 private MergeSpecification findBalancedMerges(SegmentInfos infos, int infoLen, int maxNumSegments, boolean partialExpunge)
152 if (infoLen <= maxNumSegments) return null;
154 MergeSpecification spec = new MergeSpecification();
156 // use Viterbi algorithm to find the best segmentation.
157 // we will try to minimize the size variance of resulting segments.
159 double[][] variance = createVarianceTable(infos, infoLen, maxNumSegments);
161 final int maxMergeSegments = infoLen - maxNumSegments + 1;
162 double[] sumVariance = new double[maxMergeSegments];
163 int[][] backLink = new int[maxNumSegments][maxMergeSegments];
165 for(int i = (maxMergeSegments - 1); i >= 0; i--) {
166 sumVariance[i] = variance[0][i];
169 for(int i = 1; i < maxNumSegments; i++) {
170 for(int j = (maxMergeSegments - 1); j >= 0; j--) {
171 double minV = Double.MAX_VALUE;
173 for(int k = j; k >= 0; k--) {
174 double v = sumVariance[k] + variance[i + k][j - k];
180 sumVariance[j] = minV;
181 backLink[i][j] = minK;
185 // now, trace back the back links to find all merges,
186 // also find a candidate for partial expunge if requested
187 int mergeEnd = infoLen;
188 int prev = maxMergeSegments - 1;
189 int expungeCandidate = -1;
191 for(int i = maxNumSegments - 1; i >= 0; i--) {
192 prev = backLink[i][prev];
193 int mergeStart = i + prev;
194 if((mergeEnd - mergeStart) > 1) {
195 spec.add(new OneMerge(infos.asList().subList(mergeStart, mergeEnd)));
198 SegmentInfo info = infos.info(mergeStart);
199 int delCount = info.getDelCount();
200 if(delCount > maxDelCount) {
201 expungeCandidate = mergeStart;
202 maxDelCount = delCount;
206 mergeEnd = mergeStart;
209 if(partialExpunge && maxDelCount > 0) {
211 spec.add(new OneMerge(Collections.singletonList(infos.info(expungeCandidate))));
217 private double[][] createVarianceTable(SegmentInfos infos, int last, int maxNumSegments) throws IOException {
218 int maxMergeSegments = last - maxNumSegments + 1;
219 double[][] variance = new double[last][maxMergeSegments];
221 // compute the optimal segment size
223 long[] sizeArr = new long[last];
224 for(int i = 0; i < sizeArr.length; i++) {
225 sizeArr[i] = size(infos.info(i));
226 optSize += sizeArr[i];
228 optSize = (optSize / maxNumSegments);
230 for(int i = 0; i < last; i++) {
232 for(int j = 0; j < maxMergeSegments; j++) {
234 size += sizeArr[i + j];
235 double residual = ((double)size/(double)optSize) - 1.0d;
236 variance[i][j] = residual * residual;
238 variance[i][j] = Double.NaN;
246 public MergeSpecification findForcedDeletesMerges(SegmentInfos infos)
247 throws CorruptIndexException, IOException {
248 final int numSegs = infos.size();
249 final int numLargeSegs = (numSegs < _numLargeSegments ? numSegs : _numLargeSegments);
250 MergeSpecification spec = null;
252 if(numLargeSegs < numSegs) {
253 // hack to create a shallow sub-range as SegmentInfos instance,
254 // it does not clone all metadata, but LogMerge does not need it
255 final SegmentInfos smallSegments = new SegmentInfos();
256 smallSegments.rollbackSegmentInfos(infos.asList().subList(numLargeSegs, numSegs));
257 spec = super.findForcedDeletesMerges(smallSegments);
260 if(spec == null) spec = new MergeSpecification();
261 for(int i = 0; i < numLargeSegs; i++) {
262 SegmentInfo info = infos.info(i);
263 if(info.hasDeletions()) {
264 spec.add(new OneMerge(Collections.singletonList(infos.info(i))));
271 public MergeSpecification findMerges(SegmentInfos infos) throws IOException {
272 final int numSegs = infos.size();
273 final int numLargeSegs = _numLargeSegments;
275 if (numSegs <= numLargeSegs) {
279 long totalLargeSegSize = 0;
280 long totalSmallSegSize = 0;
283 // compute the total size of large segments
284 for(int i = 0; i < numLargeSegs; i++) {
285 info = infos.info(i);
286 totalLargeSegSize += size(info);
288 // compute the total size of small segments
289 for(int i = numLargeSegs; i < numSegs; i++) {
290 info = infos.info(i);
291 totalSmallSegSize += size(info);
294 long targetSegSize = (totalLargeSegSize / (numLargeSegs - 1));
295 if(targetSegSize <= totalSmallSegSize) {
296 // the total size of small segments is big enough,
297 // promote the small segments to a large segment and do balanced merge,
299 if(totalSmallSegSize < targetSegSize * 2) {
300 MergeSpecification spec = findBalancedMerges(infos, numLargeSegs, (numLargeSegs - 1), _partialExpunge);
301 if(spec == null) spec = new MergeSpecification(); // should not happen
302 spec.add(new OneMerge(infos.asList().subList(numLargeSegs, numSegs)));
305 return findBalancedMerges(infos, numSegs, numLargeSegs, _partialExpunge);
307 } else if (_maxSegments < numSegs) {
308 // we have more than _maxSegments, merge small segments smaller than targetSegSize/4
309 MergeSpecification spec = new MergeSpecification();
310 int startSeg = numLargeSegs;
311 long sizeThreshold = (targetSegSize / 4);
312 while(startSeg < numSegs) {
313 info = infos.info(startSeg);
314 if(size(info) < sizeThreshold) break;
317 spec.add(new OneMerge(infos.asList().subList(startSeg, numSegs)));
320 // hack to create a shallow sub-range as SegmentInfos instance,
321 // it does not clone all metadata, but LogMerge does not need it
322 final SegmentInfos smallSegments = new SegmentInfos();
323 smallSegments.rollbackSegmentInfos(infos.asList().subList(numLargeSegs, numSegs));
324 MergeSpecification spec = super.findMerges(smallSegments);
326 if(_partialExpunge) {
327 OneMerge expunge = findOneSegmentToExpunge(infos, numLargeSegs);
328 if(expunge != null) {
329 if(spec == null) spec = new MergeSpecification();
337 private OneMerge findOneSegmentToExpunge(SegmentInfos infos, int maxNumSegments) throws IOException {
338 int expungeCandidate = -1;
341 for(int i = maxNumSegments - 1; i >= 0; i--) {
342 SegmentInfo info = infos.info(i);
343 int delCount = info.getDelCount();
344 if (delCount > maxDelCount) {
345 expungeCandidate = i;
346 maxDelCount = delCount;
349 if (maxDelCount > 0) {
350 return new OneMerge(Collections.singletonList(infos.info(expungeCandidate)));
356 public static class MergePolicyParams {
357 private int _numLargeSegments;
358 private int _maxSmallSegments;
359 private boolean _doPartialExpunge;
360 private int _mergeFactor;
361 private boolean _useCompoundFile;
362 private int _maxMergeDocs;
364 public MergePolicyParams() {
365 _useCompoundFile = true;
366 _doPartialExpunge = false;
367 _numLargeSegments = DEFAULT_NUM_LARGE_SEGMENTS;
368 _maxSmallSegments = 2 * LogMergePolicy.DEFAULT_MERGE_FACTOR;
369 _maxSmallSegments = _numLargeSegments + _maxSmallSegments;
370 _mergeFactor = LogMergePolicy.DEFAULT_MERGE_FACTOR;
371 _maxMergeDocs = LogMergePolicy.DEFAULT_MAX_MERGE_DOCS;
374 public void setNumLargeSegments(int numLargeSegments) {
375 _numLargeSegments = numLargeSegments;
378 public int getNumLargeSegments() {
379 return _numLargeSegments;
382 public void setMaxSmallSegments(int maxSmallSegments) {
383 _maxSmallSegments = maxSmallSegments;
386 public int getMaxSmallSegments() {
387 return _maxSmallSegments;
390 public void setPartialExpunge(boolean doPartialExpunge) {
391 _doPartialExpunge = doPartialExpunge;
394 public boolean getPartialExpunge() {
395 return _doPartialExpunge;
398 public void setMergeFactor(int mergeFactor) {
399 _mergeFactor = mergeFactor;
402 public int getMergeFactor() {
406 public void setMaxMergeDocs(int maxMergeDocs) {
407 _maxMergeDocs = maxMergeDocs;
410 public int getMaxMergeDocs() {
411 return _maxMergeDocs;
414 public void setUseCompoundFile(boolean useCompoundFile) {
415 _useCompoundFile = useCompoundFile;
418 public boolean isUseCompoundFile() {
419 return _useCompoundFile;