diff --git a/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/index/TieredMergePolicy.java b/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/index/TieredMergePolicy.java
deleted file mode 100644
index 653afad..0000000
--- a/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/index/TieredMergePolicy.java
+++ /dev/null
@@ -1,693 +0,0 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Map;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Comparator;
import java.util.List;
import java.util.ArrayList;

/**
 *  Merges segments of approximately equal size, subject to
 *  an allowed number of segments per tier. This is similar
 *  to {@link LogByteSizeMergePolicy}, except this merge
 *  policy is able to merge non-adjacent segments, and
 *  separates how many segments are merged at once ({@link
 *  #setMaxMergeAtOnce}) from how many segments are allowed
 *  per tier ({@link #setSegmentsPerTier}). This merge
 *  policy also does not over-merge (ie, cascade merges).
 *
 *  <p>For normal merging, this policy first computes a
 *  "budget" of how many segments are allowed to be in the
 *  index. If the index is over-budget, then the policy
 *  sorts segments by decreasing size (pro-rating by percent
 *  deletes), and then finds the least-cost merge. Merge
 *  cost is measured by a combination of the "skew" of the
 *  merge (size of largest seg divided by smallest seg),
 *  total merge size and pct deletes reclaimed, so that
 *  merges with lower skew, smaller size and those
 *  reclaiming more deletes are favored.
 *
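 *  <p>An illustrative walk-through of the budget, assuming
 *  the defaults (2 MB floor, 10 segments per tier,
 *  maxMergeAtOnce=10): a 1024 MB index whose smallest
 *  segment is at or below the floor is allowed 10 segments
 *  at the 2 MB tier, 10 at the 20 MB tier, and
 *  ceil(804/200) = 5 at the 200 MB tier, for a budget of
 *  roughly 25 segments.
 *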
 *  <p>If a merge will produce a segment that's larger than
 *  {@link #setMaxMergedSegmentMB}, then the policy will
 *  merge fewer segments (down to 1 at once, if that one has
 *  deletions) to keep the segment size under budget.
 *
 *  <p><b>NOTE</b>: this policy freely merges non-adjacent
 *  segments; if this is a problem, use {@link
 *  LogMergePolicy}.
 *
 *  <p><b>NOTE</b>: This policy always merges by byte size
 *  of the segments, always pro-rates by percent deletes,
 *  and does not apply any maximum segment size during
 *  optimize (unlike {@link LogByteSizeMergePolicy}).
 *
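 *  <p>A minimal configuration sketch (the values are
 *  illustrative, not recommendations; assumes an existing
 *  {@code analyzer}):
 *
 *  <pre>
 *    TieredMergePolicy tmp = new TieredMergePolicy();
 *    tmp.setMaxMergedSegmentMB(1024);  // cap merged segments at ~1 GB
 *    tmp.setMaxMergeAtOnce(8);
 *    tmp.setSegmentsPerTier(8.0);      // keep >= maxMergeAtOnce
 *    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_34, analyzer);
 *    iwc.setMergePolicy(tmp);
 *  </pre>
 *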
 *  @lucene.experimental
 */

// TODO
//   - we could try to take into account whether a large
//     merge is already running (under CMS) and then bias
//     ourselves towards picking smaller merges if so (or,
//     maybe CMS should do so)

public class TieredMergePolicy extends MergePolicy {

  private int maxMergeAtOnce = 10;
  private long maxMergedSegmentBytes = 5*1024*1024*1024L;
  private int maxMergeAtOnceExplicit = 30;

  private long floorSegmentBytes = 2*1024*1024L;
  private double segsPerTier = 10.0;
  private double expungeDeletesPctAllowed = 10.0;
  private boolean useCompoundFile = true;
  private double noCFSRatio = 0.1;
  private double reclaimDeletesWeight = 2.0;

  /** Maximum number of segments to be merged at a time
   *  during "normal" merging. For explicit merging (eg,
   *  optimize or expungeDeletes was called), see {@link
   *  #setMaxMergeAtOnceExplicit}. Default is 10. */
  public TieredMergePolicy setMaxMergeAtOnce(int v) {
    if (v < 2) {
      throw new IllegalArgumentException("maxMergeAtOnce must be > 1 (got " + v + ")");
    }
    maxMergeAtOnce = v;
    return this;
  }

  /** @see #setMaxMergeAtOnce */
  public int getMaxMergeAtOnce() {
    return maxMergeAtOnce;
  }

  // TODO: should addIndexes do explicit merging, too? And,
  // if user calls IW.maybeMerge "explicitly"

  /** Maximum number of segments to be merged at a time,
   *  during optimize or expungeDeletes. Default is 30. */
  public TieredMergePolicy setMaxMergeAtOnceExplicit(int v) {
    if (v < 2) {
      throw new IllegalArgumentException("maxMergeAtOnceExplicit must be > 1 (got " + v + ")");
    }
    maxMergeAtOnceExplicit = v;
    return this;
  }

  /** @see #setMaxMergeAtOnceExplicit */
  public int getMaxMergeAtOnceExplicit() {
    return maxMergeAtOnceExplicit;
  }

  /** Maximum sized segment to produce during
   *  normal merging. This setting is approximate: the
   *  estimate of the merged segment size is made by summing
   *  sizes of to-be-merged segments (compensating for
   *  percent deleted docs). Default is 5 GB. */
  public TieredMergePolicy setMaxMergedSegmentMB(double v) {
    maxMergedSegmentBytes = (long) (v*1024*1024);
    return this;
  }

  /** @see #setMaxMergedSegmentMB */
  public double getMaxMergedSegmentMB() {
    return maxMergedSegmentBytes/1024/1024.;
  }

  /** Controls how aggressively merges that reclaim more
   *  deletions are favored. Higher values favor selecting
   *  merges that reclaim deletions. A value of 0.0 means
   *  deletions don't impact merge selection. */
  public TieredMergePolicy setReclaimDeletesWeight(double v) {
    if (v < 0.0) {
      throw new IllegalArgumentException("reclaimDeletesWeight must be >= 0.0 (got " + v + ")");
    }
    reclaimDeletesWeight = v;
    return this;
  }

  /** See {@link #setReclaimDeletesWeight}. */
  public double getReclaimDeletesWeight() {
    return reclaimDeletesWeight;
  }

  /** Segments smaller than this are "rounded up" to this
   *  size, ie treated as equal (floor) size for merge
   *  selection. This is to prevent frequent flushing of
   *  tiny segments from allowing a long tail in the index.
   *  Default is 2 MB. */
  public TieredMergePolicy setFloorSegmentMB(double v) {
    if (v <= 0.0) {
      throw new IllegalArgumentException("floorSegmentMB must be > 0.0 (got " + v + ")");
    }
    floorSegmentBytes = (long) (v*1024*1024);
    return this;
  }

  /** @see #setFloorSegmentMB */
  public double getFloorSegmentMB() {
    return floorSegmentBytes/1024/1024.;
  }

  /** When expungeDeletes is called, we only merge away a
   *  segment if its delete percentage is over this
   *  threshold. Default is 10%. */
  public TieredMergePolicy setExpungeDeletesPctAllowed(double v) {
    if (v < 0.0 || v > 100.0) {
      throw new IllegalArgumentException("expungeDeletesPctAllowed must be between 0.0 and 100.0 inclusive (got " + v + ")");
    }
    expungeDeletesPctAllowed = v;
    return this;
  }

  /** @see #setExpungeDeletesPctAllowed */
  public double getExpungeDeletesPctAllowed() {
    return expungeDeletesPctAllowed;
  }

  /** Sets the allowed number of segments per tier. Smaller
   *  values mean more merging but fewer segments.
   *
   *  <p><b>NOTE</b>: this value should be >= the {@link
   *  #setMaxMergeAtOnce} setting, otherwise you'll force too
   *  much merging to occur.</p>
   *
   *  <p>Default is 10.0.</p>
   */
  public TieredMergePolicy setSegmentsPerTier(double v) {
    if (v < 2.0) {
      throw new IllegalArgumentException("segmentsPerTier must be >= 2.0 (got " + v + ")");
    }
    segsPerTier = v;
    return this;
  }

  /** @see #setSegmentsPerTier */
  public double getSegmentsPerTier() {
    return segsPerTier;
  }

  /** Sets whether compound file format should be used for
   *  newly flushed and newly merged segments. Default
   *  true. */
  public TieredMergePolicy setUseCompoundFile(boolean useCompoundFile) {
    this.useCompoundFile = useCompoundFile;
    return this;
  }

  /** @see #setUseCompoundFile */
  public boolean getUseCompoundFile() {
    return useCompoundFile;
  }

  /** If a merged segment will be more than this percentage
   *  of the total size of the index, leave the segment as
   *  non-compound file even if compound file is enabled.
   *  Set to 1.0 to always use CFS regardless of merge
   *  size. Default is 0.1. */
  public TieredMergePolicy setNoCFSRatio(double noCFSRatio) {
    if (noCFSRatio < 0.0 || noCFSRatio > 1.0) {
      throw new IllegalArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + noCFSRatio);
    }
    this.noCFSRatio = noCFSRatio;
    return this;
  }

  /** @see #setNoCFSRatio */
  public double getNoCFSRatio() {
    return noCFSRatio;
  }

  private class SegmentByteSizeDescending implements Comparator<SegmentInfo> {
    public int compare(SegmentInfo o1, SegmentInfo o2) {
      try {
        final long sz1 = size(o1);
        final long sz2 = size(o2);
        if (sz1 > sz2) {
          return -1;
        } else if (sz2 > sz1) {
          return 1;
        } else {
          return o1.name.compareTo(o2.name);
        }
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }
    }
  }

  private final Comparator<SegmentInfo> segmentByteSizeDescending = new SegmentByteSizeDescending();

  protected static abstract class MergeScore {
    abstract double getScore();
    abstract String getExplanation();
  }

  @Override
  public MergeSpecification findMerges(SegmentInfos infos) throws IOException {
    if (verbose()) {
      message("findMerges: " + infos.size() + " segments");
    }
    if (infos.size() == 0) {
      return null;
    }
    final Collection<SegmentInfo> merging = writer.get().getMergingSegments();
    final Collection<SegmentInfo> toBeMerged = new HashSet<SegmentInfo>();

    final List<SegmentInfo> infosSorted = new ArrayList<SegmentInfo>(infos.asList());
    Collections.sort(infosSorted, segmentByteSizeDescending);

    // Compute total index bytes & print details about the index
    long totIndexBytes = 0;
    long minSegmentBytes = Long.MAX_VALUE;
    for(SegmentInfo info : infosSorted) {
      final long segBytes = size(info);
      if (verbose()) {
        String extra = merging.contains(info) ? " [merging]" : "";
        if (segBytes >= maxMergedSegmentBytes/2.0) {
          extra += " [skip: too large]";
        } else if (segBytes < floorSegmentBytes) {
          extra += " [floored]";
        }
        message("  seg=" + writer.get().segString(info) + " size=" + String.format("%.3f", segBytes/1024/1024.) + " MB" + extra);
      }
+ " MB" + extra); - } - - minSegmentBytes = Math.min(segBytes, minSegmentBytes); - // Accum total byte size - totIndexBytes += segBytes; - } - - // If we have too-large segments, grace them out - // of the maxSegmentCount: - int tooBigCount = 0; - while (tooBigCount < infosSorted.size() && size(infosSorted.get(tooBigCount)) >= maxMergedSegmentBytes/2.0) { - totIndexBytes -= size(infosSorted.get(tooBigCount)); - tooBigCount++; - } - - minSegmentBytes = floorSize(minSegmentBytes); - - // Compute max allowed segs in the index - long levelSize = minSegmentBytes; - long bytesLeft = totIndexBytes; - double allowedSegCount = 0; - while(true) { - final double segCountLevel = bytesLeft / (double) levelSize; - if (segCountLevel < segsPerTier) { - allowedSegCount += Math.ceil(segCountLevel); - break; - } - allowedSegCount += segsPerTier; - bytesLeft -= segsPerTier * levelSize; - levelSize *= maxMergeAtOnce; - } - int allowedSegCountInt = (int) allowedSegCount; - - MergeSpecification spec = null; - - // Cycle to possibly select more than one merge: - while(true) { - - long mergingBytes = 0; - - // Gather eligible segments for merging, ie segments - // not already being merged and not already picked (by - // prior iteration of this loop) for merging: - final List eligible = new ArrayList(); - for(int idx = tooBigCount; idx= maxMergedSegmentBytes; - - message(" allowedSegmentCount=" + allowedSegCountInt + " vs count=" + infosSorted.size() + " (eligible count=" + eligible.size() + ") tooBigCount=" + tooBigCount); - - if (eligible.size() == 0) { - return spec; - } - - if (eligible.size() >= allowedSegCountInt) { - - // OK we are over budget -- find best merge! - MergeScore bestScore = null; - List best = null; - boolean bestTooLarge = false; - long bestMergeBytes = 0; - - // Consider all merge starts: - for(int startIdx = 0;startIdx <= eligible.size()-maxMergeAtOnce; startIdx++) { - - long totAfterMergeBytes = 0; - - final List candidate = new ArrayList(); - boolean hitTooLarge = false; - for(int idx = startIdx;idx maxMergedSegmentBytes) { - hitTooLarge = true; - // NOTE: we continue, so that we can try - // "packing" smaller segments into this merge - // to see if we can get closer to the max - // size; this in general is not perfect since - // this is really "bin packing" and we'd have - // to try different permutations. - continue; - } - candidate.add(info); - totAfterMergeBytes += segBytes; - } - - final MergeScore score = score(candidate, hitTooLarge, mergingBytes); - message(" maybe=" + writer.get().segString(candidate) + " score=" + score.getScore() + " " + score.getExplanation() + " tooLarge=" + hitTooLarge + " size=" + String.format("%.3f MB", totAfterMergeBytes/1024./1024.)); - - // If we are already running a max sized merge - // (maxMergeIsRunning), don't allow another max - // sized merge to kick off: - if ((bestScore == null || score.getScore() < bestScore.getScore()) && (!hitTooLarge || !maxMergeIsRunning)) { - best = candidate; - bestScore = score; - bestTooLarge = hitTooLarge; - bestMergeBytes = totAfterMergeBytes; - } - } - - if (best != null) { - if (spec == null) { - spec = new MergeSpecification(); - } - final OneMerge merge = new OneMerge(best); - spec.add(merge); - for(SegmentInfo info : merge.segments) { - toBeMerged.add(info); - } - - if (verbose()) { - message(" add merge=" + writer.get().segString(merge.segments) + " size=" + String.format("%.3f MB", bestMergeBytes/1024./1024.) 
+ " score=" + String.format("%.3f", bestScore.getScore()) + " " + bestScore.getExplanation() + (bestTooLarge ? " [max merge]" : "")); - } - } else { - return spec; - } - } else { - return spec; - } - } - } - - /** Expert: scores one merge; subclasses can override. */ - protected MergeScore score(List candidate, boolean hitTooLarge, long mergingBytes) throws IOException { - long totBeforeMergeBytes = 0; - long totAfterMergeBytes = 0; - long totAfterMergeBytesFloored = 0; - for(SegmentInfo info : candidate) { - final long segBytes = size(info); - totAfterMergeBytes += segBytes; - totAfterMergeBytesFloored += floorSize(segBytes); - totBeforeMergeBytes += info.sizeInBytes(true); - } - - // Measure "skew" of the merge, which can range - // from 1.0/numSegsBeingMerged (good) to 1.0 - // (poor): - final double skew; - if (hitTooLarge) { - // Pretend the merge has perfect skew; skew doesn't - // matter in this case because this merge will not - // "cascade" and so it cannot lead to N^2 merge cost - // over time: - skew = 1.0/maxMergeAtOnce; - } else { - skew = ((double) floorSize(size(candidate.get(0))))/totAfterMergeBytesFloored; - } - - // Strongly favor merges with less skew (smaller - // mergeScore is better): - double mergeScore = skew; - - // Gently favor smaller merges over bigger ones. We - // don't want to make this exponent too large else we - // can end up doing poor merges of small segments in - // order to avoid the large merges: - mergeScore *= Math.pow(totAfterMergeBytes, 0.05); - - // Strongly favor merges that reclaim deletes: - final double nonDelRatio = ((double) totAfterMergeBytes)/totBeforeMergeBytes; - mergeScore *= Math.pow(nonDelRatio, reclaimDeletesWeight); - - final double finalMergeScore = mergeScore; - - return new MergeScore() { - - @Override - public double getScore() { - return finalMergeScore; - } - - @Override - public String getExplanation() { - return "skew=" + String.format("%.3f", skew) + " nonDelRatio=" + String.format("%.3f", nonDelRatio); - } - }; - } - - @Override - public MergeSpecification findMergesForOptimize(SegmentInfos infos, int maxSegmentCount, Map segmentsToOptimize) throws IOException { - if (verbose()) { - message("findMergesForOptimize maxSegmentCount=" + maxSegmentCount + " infos=" + writer.get().segString(infos) + " segmentsToOptimize=" + segmentsToOptimize); - } - - List eligible = new ArrayList(); - boolean optimizeMergeRunning = false; - final Collection merging = writer.get().getMergingSegments(); - boolean segmentIsOriginal = false; - for(SegmentInfo info : infos) { - final Boolean isOriginal = segmentsToOptimize.get(info); - if (isOriginal != null) { - segmentIsOriginal = isOriginal; - if (!merging.contains(info)) { - eligible.add(info); - } else { - optimizeMergeRunning = true; - } - } - } - - if (eligible.size() == 0) { - return null; - } - - if ((maxSegmentCount > 1 && eligible.size() <= maxSegmentCount) || - (maxSegmentCount == 1 && eligible.size() == 1 && (!segmentIsOriginal || isOptimized(eligible.get(0))))) { - if (verbose()) { - message("already optimized"); - } - return null; - } - - Collections.sort(eligible, segmentByteSizeDescending); - - if (verbose()) { - message("eligible=" + eligible); - message("optimizeMergeRunning=" + optimizeMergeRunning); - } - - int end = eligible.size(); - - MergeSpecification spec = null; - - // Do full merges, first, backwards: - while(end >= maxMergeAtOnceExplicit + maxSegmentCount - 1) { - if (spec == null) { - spec = new MergeSpecification(); - } - final OneMerge merge = new 
  @Override
  public MergeSpecification findMergesForOptimize(SegmentInfos infos, int maxSegmentCount, Map<SegmentInfo,Boolean> segmentsToOptimize) throws IOException {
    if (verbose()) {
      message("findMergesForOptimize maxSegmentCount=" + maxSegmentCount + " infos=" + writer.get().segString(infos) + " segmentsToOptimize=" + segmentsToOptimize);
    }

    List<SegmentInfo> eligible = new ArrayList<SegmentInfo>();
    boolean optimizeMergeRunning = false;
    final Collection<SegmentInfo> merging = writer.get().getMergingSegments();
    boolean segmentIsOriginal = false;
    for(SegmentInfo info : infos) {
      final Boolean isOriginal = segmentsToOptimize.get(info);
      if (isOriginal != null) {
        segmentIsOriginal = isOriginal;
        if (!merging.contains(info)) {
          eligible.add(info);
        } else {
          optimizeMergeRunning = true;
        }
      }
    }

    if (eligible.size() == 0) {
      return null;
    }

    if ((maxSegmentCount > 1 && eligible.size() <= maxSegmentCount) ||
        (maxSegmentCount == 1 && eligible.size() == 1 && (!segmentIsOriginal || isOptimized(eligible.get(0))))) {
      if (verbose()) {
        message("already optimized");
      }
      return null;
    }

    Collections.sort(eligible, segmentByteSizeDescending);

    if (verbose()) {
      message("eligible=" + eligible);
      message("optimizeMergeRunning=" + optimizeMergeRunning);
    }

    int end = eligible.size();

    MergeSpecification spec = null;

    // Do full merges, first, backwards:
    while(end >= maxMergeAtOnceExplicit + maxSegmentCount - 1) {
      if (spec == null) {
        spec = new MergeSpecification();
      }
      final OneMerge merge = new OneMerge(eligible.subList(end-maxMergeAtOnceExplicit, end));
      if (verbose()) {
        message("add merge=" + writer.get().segString(merge.segments));
      }
      spec.add(merge);
      end -= maxMergeAtOnceExplicit;
    }

    if (spec == null && !optimizeMergeRunning) {
      // Do final merge
      final int numToMerge = end - maxSegmentCount + 1;
      final OneMerge merge = new OneMerge(eligible.subList(end-numToMerge, end));
      if (verbose()) {
        message("add final merge=" + merge.segString(writer.get().getDirectory()));
      }
      spec = new MergeSpecification();
      spec.add(merge);
    }

    return spec;
  }

  @Override
  public MergeSpecification findMergesToExpungeDeletes(SegmentInfos infos)
      throws CorruptIndexException, IOException {
    if (verbose()) {
      message("findMergesToExpungeDeletes infos=" + writer.get().segString(infos) + " expungeDeletesPctAllowed=" + expungeDeletesPctAllowed);
    }
    final List<SegmentInfo> eligible = new ArrayList<SegmentInfo>();
    final Collection<SegmentInfo> merging = writer.get().getMergingSegments();
    for(SegmentInfo info : infos) {
      double pctDeletes = 100.*((double) writer.get().numDeletedDocs(info))/info.docCount;
      if (pctDeletes > expungeDeletesPctAllowed && !merging.contains(info)) {
        eligible.add(info);
      }
    }

    if (eligible.size() == 0) {
      return null;
    }

    Collections.sort(eligible, segmentByteSizeDescending);

    if (verbose()) {
      message("eligible=" + eligible);
    }

    int start = 0;
    MergeSpecification spec = null;

    while(start < eligible.size()) {
      long totAfterMergeBytes = 0;
      int upto = start;
      boolean done = false;
      while(upto < start + maxMergeAtOnceExplicit) {
        if (upto == eligible.size()) {
          done = true;
          break;
        }
        final SegmentInfo info = eligible.get(upto);
        final long segBytes = size(info);
        if (totAfterMergeBytes + segBytes > maxMergedSegmentBytes) {
          // TODO: we could be smarter here, eg cherry
          // picking smaller merges that'd sum up to just
          // around the max size
          break;
        }
        totAfterMergeBytes += segBytes;
        upto++;
      }

      if (upto == start) {
        // Single segment is too big; grace it
        start++;
        continue;
      }

      if (spec == null) {
        spec = new MergeSpecification();
      }

      final OneMerge merge = new OneMerge(eligible.subList(start, upto));
      if (verbose()) {
        message("add merge=" + writer.get().segString(merge.segments));
      }
      spec.add(merge);
      start = upto;
      if (done) {
        break;
      }
    }

    return spec;
  }

  @Override
  public boolean useCompoundFile(SegmentInfos infos, SegmentInfo mergedInfo) throws IOException {
    final boolean doCFS;

    if (!useCompoundFile) {
      doCFS = false;
    } else if (noCFSRatio == 1.0) {
      doCFS = true;
    } else {
      long totalSize = 0;
      for (SegmentInfo info : infos)
        totalSize += size(info);

      doCFS = size(mergedInfo) <= noCFSRatio * totalSize;
    }
    return doCFS;
  }

  @Override
  public void close() {
  }

  private boolean isOptimized(SegmentInfo info)
      throws IOException {
    IndexWriter w = writer.get();
    assert w != null;
    boolean hasDeletions = w.numDeletedDocs(info) > 0;
    return !hasDeletions &&
      !info.hasSeparateNorms() &&
      info.dir == w.getDirectory() &&
      (info.getUseCompoundFile() == useCompoundFile || noCFSRatio < 1.0);
  }

  // Segment size in bytes, pro-rated by % deleted
  private long size(SegmentInfo info) throws IOException {
    final long byteSize = info.sizeInBytes(true);
    final int delCount = writer.get().numDeletedDocs(info);
    final double delRatio = (info.docCount <= 0 ? 0.0f : ((double)delCount / (double)info.docCount));
    assert delRatio <= 1.0;
    return (long) (byteSize * (1.0-delRatio));
  }
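  // (For example: a segment whose files total 100 MB, with
  // 30% of its docs deleted, is treated as a 70 MB segment
  // by this policy -- illustrative numbers.)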
  private long floorSize(long bytes) {
    return Math.max(floorSegmentBytes, bytes);
  }

  private boolean verbose() {
    IndexWriter w = writer.get();
    return w != null && w.verbose();
  }

  private void message(String message) {
    if (verbose()) {
      writer.get().message("TMP: " + message);
    }
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder("[" + getClass().getSimpleName() + ": ");
    sb.append("maxMergeAtOnce=").append(maxMergeAtOnce).append(", ");
    sb.append("maxMergeAtOnceExplicit=").append(maxMergeAtOnceExplicit).append(", ");
    sb.append("maxMergedSegmentMB=").append(maxMergedSegmentBytes/1024/1024.).append(", ");
    sb.append("floorSegmentMB=").append(floorSegmentBytes/1024/1024.).append(", ");
    sb.append("expungeDeletesPctAllowed=").append(expungeDeletesPctAllowed).append(", ");
    sb.append("segmentsPerTier=").append(segsPerTier).append(", ");
    sb.append("useCompoundFile=").append(useCompoundFile).append(", ");
    sb.append("noCFSRatio=").append(noCFSRatio);
    return sb.toString();
  }
}
\ No newline at end of file