+++ /dev/null
-package org.apache.lucene.index;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.Map;
-
-/** <p>This class implements a {@link MergePolicy} that tries
- * to merge segments into levels of exponentially
- * increasing size, where each level has fewer segments than
- * the value of the merge factor. Whenever extra segments
- * (beyond the merge factor upper bound) are encountered,
- * all segments within the level are merged. You can get or
- * set the merge factor using {@link #getMergeFactor()} and
- * {@link #setMergeFactor(int)} respectively.</p>
- *
- * <p>This class is abstract and requires a subclass to
- * define the {@link #size} method which specifies how a
- * segment's size is determined. {@link LogDocMergePolicy}
- * is one subclass that measures size by document count in
- * the segment. {@link LogByteSizeMergePolicy} is another
- * subclass that measures size as the total byte size of the
- * file(s) for the segment.</p>
- */
-
-public abstract class LogMergePolicy extends MergePolicy {
-
- /** Defines the allowed range of log(size) for each
- * level. A level is computed by taking the max segment
- * log size, minus LEVEL_LOG_SPAN, and finding all
- * segments falling within that range. */
- public static final double LEVEL_LOG_SPAN = 0.75;
-
- /** Default merge factor, which is how many segments are
- * merged at a time */
- public static final int DEFAULT_MERGE_FACTOR = 10;
-
- /** Default maximum segment size. A segment of this size
- * or larger will never be merged. @see setMaxMergeDocs */
- public static final int DEFAULT_MAX_MERGE_DOCS = Integer.MAX_VALUE;
-
- /** Default noCFSRatio. If a merge's size is >= 10% of
- * the index, then we disable compound file for it.
- * @see #setNoCFSRatio */
- public static final double DEFAULT_NO_CFS_RATIO = 0.1;
-
- protected int mergeFactor = DEFAULT_MERGE_FACTOR;
-
- protected long minMergeSize;
- protected long maxMergeSize;
- // Although the core MPs set it explicitly, we must default in case someone
- // out there wrote his own LMP ...
- protected long maxMergeSizeForOptimize = Long.MAX_VALUE;
- protected int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;
-
- protected double noCFSRatio = DEFAULT_NO_CFS_RATIO;
-
- protected boolean calibrateSizeByDeletes = true;
-
- protected boolean useCompoundFile = true;
-
- public LogMergePolicy() {
- super();
- }
-
- protected boolean verbose() {
- IndexWriter w = writer.get();
- return w != null && w.verbose();
- }
-
- /** @see #setNoCFSRatio */
- public double getNoCFSRatio() {
- return noCFSRatio;
- }
-
- /** If a merged segment will be more than this percentage
- * of the total size of the index, leave the segment as
- * non-compound file even if compound file is enabled.
- * Set to 1.0 to always use CFS regardless of merge
- * size. */
- public void setNoCFSRatio(double noCFSRatio) {
- if (noCFSRatio < 0.0 || noCFSRatio > 1.0) {
- throw new IllegalArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + noCFSRatio);
- }
- this.noCFSRatio = noCFSRatio;
- }
-
- protected void message(String message) {
- if (verbose())
- writer.get().message("LMP: " + message);
- }
-
- /** <p>Returns the number of segments that are merged at
- * once and also controls the total number of segments
- * allowed to accumulate in the index.</p> */
- public int getMergeFactor() {
- return mergeFactor;
- }
-
- /** Determines how often segment indices are merged by
- * addDocument(). With smaller values, less RAM is used
- * while indexing, and searches on unoptimized indices are
- * faster, but indexing speed is slower. With larger
- * values, more RAM is used during indexing, and while
- * searches on unoptimized indices are slower, indexing is
- * faster. Thus larger values (> 10) are best for batch
- * index creation, and smaller values (< 10) for indices
- * that are interactively maintained. */
- public void setMergeFactor(int mergeFactor) {
- if (mergeFactor < 2)
- throw new IllegalArgumentException("mergeFactor cannot be less than 2");
- this.mergeFactor = mergeFactor;
- }
-
- // Javadoc inherited
- @Override
- public boolean useCompoundFile(SegmentInfos infos, SegmentInfo mergedInfo) throws IOException {
- final boolean doCFS;
-
- if (!useCompoundFile) {
- doCFS = false;
- } else if (noCFSRatio == 1.0) {
- doCFS = true;
- } else {
- long totalSize = 0;
- for (SegmentInfo info : infos)
- totalSize += size(info);
-
- doCFS = size(mergedInfo) <= noCFSRatio * totalSize;
- }
- return doCFS;
- }
-
- /** Sets whether compound file format should be used for
- * newly flushed and newly merged segments. */
- public void setUseCompoundFile(boolean useCompoundFile) {
- this.useCompoundFile = useCompoundFile;
- }
-
- /** Returns true if newly flushed and newly merge segments
- * are written in compound file format. @see
- * #setUseCompoundFile */
- public boolean getUseCompoundFile() {
- return useCompoundFile;
- }
-
- /** Sets whether the segment size should be calibrated by
- * the number of deletes when choosing segments for merge. */
- public void setCalibrateSizeByDeletes(boolean calibrateSizeByDeletes) {
- this.calibrateSizeByDeletes = calibrateSizeByDeletes;
- }
-
- /** Returns true if the segment size should be calibrated
- * by the number of deletes when choosing segments for merge. */
- public boolean getCalibrateSizeByDeletes() {
- return calibrateSizeByDeletes;
- }
-
- @Override
- public void close() {}
-
- abstract protected long size(SegmentInfo info) throws IOException;
-
- protected long sizeDocs(SegmentInfo info) throws IOException {
- if (calibrateSizeByDeletes) {
- int delCount = writer.get().numDeletedDocs(info);
- assert delCount <= info.docCount;
- return (info.docCount - (long)delCount);
- } else {
- return info.docCount;
- }
- }
-
- protected long sizeBytes(SegmentInfo info) throws IOException {
- long byteSize = info.sizeInBytes(true);
- if (calibrateSizeByDeletes) {
- int delCount = writer.get().numDeletedDocs(info);
- double delRatio = (info.docCount <= 0 ? 0.0f : ((float)delCount / (float)info.docCount));
- assert delRatio <= 1.0;
- return (info.docCount <= 0 ? byteSize : (long)(byteSize * (1.0 - delRatio)));
- } else {
- return byteSize;
- }
- }
-
- protected boolean isOptimized(SegmentInfos infos, int maxNumSegments, Map<SegmentInfo,Boolean> segmentsToOptimize) throws IOException {
- final int numSegments = infos.size();
- int numToOptimize = 0;
- SegmentInfo optimizeInfo = null;
- boolean segmentIsOriginal = false;
- for(int i=0;i<numSegments && numToOptimize <= maxNumSegments;i++) {
- final SegmentInfo info = infos.info(i);
- final Boolean isOriginal = segmentsToOptimize.get(info);
- if (isOriginal != null) {
- segmentIsOriginal = isOriginal;
- numToOptimize++;
- optimizeInfo = info;
- }
- }
-
- return numToOptimize <= maxNumSegments &&
- (numToOptimize != 1 || !segmentIsOriginal || isOptimized(optimizeInfo));
- }
-
- /** Returns true if this single info is optimized (has no
- * pending norms or deletes, is in the same dir as the
- * writer, and matches the current compound file setting */
- protected boolean isOptimized(SegmentInfo info)
- throws IOException {
- IndexWriter w = writer.get();
- assert w != null;
- boolean hasDeletions = w.numDeletedDocs(info) > 0;
- return !hasDeletions &&
- !info.hasSeparateNorms() &&
- info.dir == w.getDirectory() &&
- (info.getUseCompoundFile() == useCompoundFile || noCFSRatio < 1.0);
- }
-
- /**
- * Returns the merges necessary to optimize the index, taking the max merge
- * size or max merge docs into consideration. This method attempts to respect
- * the {@code maxNumSegments} parameter, however it might be, due to size
- * constraints, that more than that number of segments will remain in the
- * index. Also, this method does not guarantee that exactly {@code
- * maxNumSegments} will remain, but <= that number.
- */
- private MergeSpecification findMergesForOptimizeSizeLimit(
- SegmentInfos infos, int maxNumSegments, int last) throws IOException {
- MergeSpecification spec = new MergeSpecification();
- final List<SegmentInfo> segments = infos.asList();
-
- int start = last - 1;
- while (start >= 0) {
- SegmentInfo info = infos.info(start);
- if (size(info) > maxMergeSizeForOptimize || sizeDocs(info) > maxMergeDocs) {
- if (verbose()) {
- message("optimize: skip segment=" + info + ": size is > maxMergeSize (" + maxMergeSizeForOptimize + ") or sizeDocs is > maxMergeDocs (" + maxMergeDocs + ")");
- }
- // need to skip that segment + add a merge for the 'right' segments,
- // unless there is only 1 which is optimized.
- if (last - start - 1 > 1 || (start != last - 1 && !isOptimized(infos.info(start + 1)))) {
- // there is more than 1 segment to the right of this one, or an unoptimized single segment.
- spec.add(new OneMerge(segments.subList(start + 1, last)));
- }
- last = start;
- } else if (last - start == mergeFactor) {
- // mergeFactor eligible segments were found, add them as a merge.
- spec.add(new OneMerge(segments.subList(start, last)));
- last = start;
- }
- --start;
- }
-
- // Add any left-over segments, unless there is just 1 already optimized.
- if (last > 0 && (++start + 1 < last || !isOptimized(infos.info(start)))) {
- spec.add(new OneMerge(segments.subList(start, last)));
- }
-
- return spec.merges.size() == 0 ? null : spec;
- }
-
- /**
- * Returns the merges necessary to optimize the index. This method constraints
- * the returned merges only by the {@code maxNumSegments} parameter, and
- * guaranteed that exactly that number of segments will remain in the index.
- */
- private MergeSpecification findMergesForOptimizeMaxNumSegments(SegmentInfos infos, int maxNumSegments, int last) throws IOException {
- MergeSpecification spec = new MergeSpecification();
- final List<SegmentInfo> segments = infos.asList();
-
- // First, enroll all "full" merges (size
- // mergeFactor) to potentially be run concurrently:
- while (last - maxNumSegments + 1 >= mergeFactor) {
- spec.add(new OneMerge(segments.subList(last - mergeFactor, last)));
- last -= mergeFactor;
- }
-
- // Only if there are no full merges pending do we
- // add a final partial (< mergeFactor segments) merge:
- if (0 == spec.merges.size()) {
- if (maxNumSegments == 1) {
-
- // Since we must optimize down to 1 segment, the
- // choice is simple:
- if (last > 1 || !isOptimized(infos.info(0))) {
- spec.add(new OneMerge(segments.subList(0, last)));
- }
- } else if (last > maxNumSegments) {
-
- // Take care to pick a partial merge that is
- // least cost, but does not make the index too
- // lopsided. If we always just picked the
- // partial tail then we could produce a highly
- // lopsided index over time:
-
- // We must merge this many segments to leave
- // maxNumSegments in the index (from when
- // optimize was first kicked off):
- final int finalMergeSize = last - maxNumSegments + 1;
-
- // Consider all possible starting points:
- long bestSize = 0;
- int bestStart = 0;
-
- for(int i=0;i<last-finalMergeSize+1;i++) {
- long sumSize = 0;
- for(int j=0;j<finalMergeSize;j++)
- sumSize += size(infos.info(j+i));
- if (i == 0 || (sumSize < 2*size(infos.info(i-1)) && sumSize < bestSize)) {
- bestStart = i;
- bestSize = sumSize;
- }
- }
-
- spec.add(new OneMerge(segments.subList(bestStart, bestStart + finalMergeSize)));
- }
- }
- return spec.merges.size() == 0 ? null : spec;
- }
-
- /** Returns the merges necessary to optimize the index.
- * This merge policy defines "optimized" to mean only the
- * requested number of segments is left in the index, and
- * respects the {@link #maxMergeSizeForOptimize} setting.
- * By default, and assuming {@code maxNumSegments=1}, only
- * one segment will be left in the index, where that segment
- * has no deletions pending nor separate norms, and it is in
- * compound file format if the current useCompoundFile
- * setting is true. This method returns multiple merges
- * (mergeFactor at a time) so the {@link MergeScheduler}
- * in use may make use of concurrency. */
- @Override
- public MergeSpecification findMergesForOptimize(SegmentInfos infos,
- int maxNumSegments, Map<SegmentInfo,Boolean> segmentsToOptimize) throws IOException {
-
- assert maxNumSegments > 0;
- if (verbose()) {
- message("findMergesForOptimize: maxNumSegs=" + maxNumSegments + " segsToOptimize="+ segmentsToOptimize);
- }
-
- // If the segments are already optimized (e.g. there's only 1 segment), or
- // there are <maxNumSegements, all optimized, nothing to do.
- if (isOptimized(infos, maxNumSegments, segmentsToOptimize)) return null;
-
- // Find the newest (rightmost) segment that needs to
- // be optimized (other segments may have been flushed
- // since optimize started):
- int last = infos.size();
- while (last > 0) {
- final SegmentInfo info = infos.info(--last);
- if (segmentsToOptimize.get(info) != null) {
- last++;
- break;
- }
- }
-
- if (last == 0) return null;
-
- // There is only one segment already, and it is optimized
- if (maxNumSegments == 1 && last == 1 && isOptimized(infos.info(0))) return null;
-
- // Check if there are any segments above the threshold
- boolean anyTooLarge = false;
- for (int i = 0; i < last; i++) {
- SegmentInfo info = infos.info(i);
- if (size(info) > maxMergeSizeForOptimize || sizeDocs(info) > maxMergeDocs) {
- anyTooLarge = true;
- break;
- }
- }
-
- if (anyTooLarge) {
- return findMergesForOptimizeSizeLimit(infos, maxNumSegments, last);
- } else {
- return findMergesForOptimizeMaxNumSegments(infos, maxNumSegments, last);
- }
- }
-
- /**
- * Finds merges necessary to expunge all deletes from the
- * index. We simply merge adjacent segments that have
- * deletes, up to mergeFactor at a time.
- */
- @Override
- public MergeSpecification findMergesToExpungeDeletes(SegmentInfos segmentInfos)
- throws CorruptIndexException, IOException {
- final List<SegmentInfo> segments = segmentInfos.asList();
- final int numSegments = segments.size();
-
- if (verbose())
- message("findMergesToExpungeDeletes: " + numSegments + " segments");
-
- MergeSpecification spec = new MergeSpecification();
- int firstSegmentWithDeletions = -1;
- IndexWriter w = writer.get();
- assert w != null;
- for(int i=0;i<numSegments;i++) {
- final SegmentInfo info = segmentInfos.info(i);
- int delCount = w.numDeletedDocs(info);
- if (delCount > 0) {
- if (verbose())
- message(" segment " + info.name + " has deletions");
- if (firstSegmentWithDeletions == -1)
- firstSegmentWithDeletions = i;
- else if (i - firstSegmentWithDeletions == mergeFactor) {
- // We've seen mergeFactor segments in a row with
- // deletions, so force a merge now:
- if (verbose())
- message(" add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
- spec.add(new OneMerge(segments.subList(firstSegmentWithDeletions, i)));
- firstSegmentWithDeletions = i;
- }
- } else if (firstSegmentWithDeletions != -1) {
- // End of a sequence of segments with deletions, so,
- // merge those past segments even if it's fewer than
- // mergeFactor segments
- if (verbose())
- message(" add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
- spec.add(new OneMerge(segments.subList(firstSegmentWithDeletions, i)));
- firstSegmentWithDeletions = -1;
- }
- }
-
- if (firstSegmentWithDeletions != -1) {
- if (verbose())
- message(" add merge " + firstSegmentWithDeletions + " to " + (numSegments-1) + " inclusive");
- spec.add(new OneMerge(segments.subList(firstSegmentWithDeletions, numSegments)));
- }
-
- return spec;
- }
-
- private static class SegmentInfoAndLevel implements Comparable<SegmentInfoAndLevel> {
- SegmentInfo info;
- float level;
- int index;
-
- public SegmentInfoAndLevel(SegmentInfo info, float level, int index) {
- this.info = info;
- this.level = level;
- this.index = index;
- }
-
- // Sorts largest to smallest
- public int compareTo(SegmentInfoAndLevel other) {
- if (level < other.level)
- return 1;
- else if (level > other.level)
- return -1;
- else
- return 0;
- }
- }
-
- /** Checks if any merges are now necessary and returns a
- * {@link MergePolicy.MergeSpecification} if so. A merge
- * is necessary when there are more than {@link
- * #setMergeFactor} segments at a given level. When
- * multiple levels have too many segments, this method
- * will return multiple merges, allowing the {@link
- * MergeScheduler} to use concurrency. */
- @Override
- public MergeSpecification findMerges(SegmentInfos infos) throws IOException {
-
- final int numSegments = infos.size();
- if (verbose())
- message("findMerges: " + numSegments + " segments");
-
- // Compute levels, which is just log (base mergeFactor)
- // of the size of each segment
- final List<SegmentInfoAndLevel> levels = new ArrayList<SegmentInfoAndLevel>();
- final float norm = (float) Math.log(mergeFactor);
-
- final Collection<SegmentInfo> mergingSegments = writer.get().getMergingSegments();
-
- for(int i=0;i<numSegments;i++) {
- final SegmentInfo info = infos.info(i);
- long size = size(info);
-
- // Floor tiny segments
- if (size < 1) {
- size = 1;
- }
-
- final SegmentInfoAndLevel infoLevel = new SegmentInfoAndLevel(info, (float) Math.log(size)/norm, i);
- levels.add(infoLevel);
-
- if (verbose()) {
- final long segBytes = sizeBytes(info);
- String extra = mergingSegments.contains(info) ? " [merging]" : "";
- if (size >= maxMergeSize) {
- extra += " [skip: too large]";
- }
- message("seg=" + writer.get().segString(info) + " level=" + infoLevel.level + " size=" + String.format("%.3f MB", segBytes/1024/1024.) + extra);
- }
- }
-
- final float levelFloor;
- if (minMergeSize <= 0)
- levelFloor = (float) 0.0;
- else
- levelFloor = (float) (Math.log(minMergeSize)/norm);
-
- // Now, we quantize the log values into levels. The
- // first level is any segment whose log size is within
- // LEVEL_LOG_SPAN of the max size, or, who has such as
- // segment "to the right". Then, we find the max of all
- // other segments and use that to define the next level
- // segment, etc.
-
- MergeSpecification spec = null;
-
- final int numMergeableSegments = levels.size();
-
- int start = 0;
- while(start < numMergeableSegments) {
-
- // Find max level of all segments not already
- // quantized.
- float maxLevel = levels.get(start).level;
- for(int i=1+start;i<numMergeableSegments;i++) {
- final float level = levels.get(i).level;
- if (level > maxLevel)
- maxLevel = level;
- }
-
- // Now search backwards for the rightmost segment that
- // falls into this level:
- float levelBottom;
- if (maxLevel <= levelFloor)
- // All remaining segments fall into the min level
- levelBottom = -1.0F;
- else {
- levelBottom = (float) (maxLevel - LEVEL_LOG_SPAN);
-
- // Force a boundary at the level floor
- if (levelBottom < levelFloor && maxLevel >= levelFloor)
- levelBottom = levelFloor;
- }
-
- int upto = numMergeableSegments-1;
- while(upto >= start) {
- if (levels.get(upto).level >= levelBottom) {
- break;
- }
- upto--;
- }
- if (verbose())
- message(" level " + levelBottom + " to " + maxLevel + ": " + (1+upto-start) + " segments");
-
- // Finally, record all merges that are viable at this level:
- int end = start + mergeFactor;
- while(end <= 1+upto) {
- boolean anyTooLarge = false;
- boolean anyMerging = false;
- for(int i=start;i<end;i++) {
- final SegmentInfo info = levels.get(i).info;
- anyTooLarge |= (size(info) >= maxMergeSize || sizeDocs(info) >= maxMergeDocs);
- if (mergingSegments.contains(info)) {
- anyMerging = true;
- break;
- }
- }
-
- if (anyMerging) {
- // skip
- } else if (!anyTooLarge) {
- if (spec == null)
- spec = new MergeSpecification();
- final List<SegmentInfo> mergeInfos = new ArrayList<SegmentInfo>();
- for(int i=start;i<end;i++) {
- mergeInfos.add(levels.get(i).info);
- assert infos.contains(levels.get(i).info);
- }
- if (verbose()) {
- message(" add merge=" + writer.get().segString(mergeInfos) + " start=" + start + " end=" + end);
- }
- spec.add(new OneMerge(mergeInfos));
- } else if (verbose()) {
- message(" " + start + " to " + end + ": contains segment over maxMergeSize or maxMergeDocs; skipping");
- }
-
- start = end;
- end = start + mergeFactor;
- }
-
- start = 1+upto;
- }
-
- return spec;
- }
-
- /** <p>Determines the largest segment (measured by
- * document count) that may be merged with other segments.
- * Small values (e.g., less than 10,000) are best for
- * interactive indexing, as this limits the length of
- * pauses while indexing to a few seconds. Larger values
- * are best for batched indexing and speedier
- * searches.</p>
- *
- * <p>The default value is {@link Integer#MAX_VALUE}.</p>
- *
- * <p>The default merge policy ({@link
- * LogByteSizeMergePolicy}) also allows you to set this
- * limit by net size (in MB) of the segment, using {@link
- * LogByteSizeMergePolicy#setMaxMergeMB}.</p>
- */
- public void setMaxMergeDocs(int maxMergeDocs) {
- this.maxMergeDocs = maxMergeDocs;
- }
-
- /** Returns the largest segment (measured by document
- * count) that may be merged with other segments.
- * @see #setMaxMergeDocs */
- public int getMaxMergeDocs() {
- return maxMergeDocs;
- }
-
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder("[" + getClass().getSimpleName() + ": ");
- sb.append("minMergeSize=").append(minMergeSize).append(", ");
- sb.append("mergeFactor=").append(mergeFactor).append(", ");
- sb.append("maxMergeSize=").append(maxMergeSize).append(", ");
- sb.append("maxMergeSizeForOptimize=").append(maxMergeSizeForOptimize).append(", ");
- sb.append("calibrateSizeByDeletes=").append(calibrateSizeByDeletes).append(", ");
- sb.append("maxMergeDocs=").append(maxMergeDocs).append(", ");
- sb.append("useCompoundFile=").append(useCompoundFile).append(", ");
- sb.append("noCFSRatio=").append(noCFSRatio);
- sb.append("]");
- return sb.toString();
- }
-
-}