package org.apache.lucene.search;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

/**
 * A {@link MultiTermQuery} rewrite method that automatically picks the
 * cheaper of two constant-score strategies: if the expanded query stays
 * under both a term-count cutoff and a document-visit cutoff, it is
 * rewritten into a constant-score {@link BooleanQuery} of the matched
 * terms; otherwise it falls back to
 * {@link MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE}.
 */
class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {

  // Defaults derived from rough tests with a 20.0 million
  // doc Wikipedia index.  With more than 350 terms in the
  // query, the filter method is fastest:
  public static int DEFAULT_TERM_COUNT_CUTOFF = 350;

  // If the query will hit more than 1 in 1000 of the docs
  // in the index (0.1%), the filter method is fastest:
  public static double DEFAULT_DOC_COUNT_PERCENT = 0.1;

  private int termCountCutoff = DEFAULT_TERM_COUNT_CUTOFF;
  private double docCountPercent = DEFAULT_DOC_COUNT_PERCENT;

  /** If the number of terms in this query is equal to or
   * larger than this setting then {@link
   * #CONSTANT_SCORE_FILTER_REWRITE} is used. */
  public void setTermCountCutoff(int count) {
    termCountCutoff = count;
  }

  /** @see #setTermCountCutoff */
  public int getTermCountCutoff() {
    return termCountCutoff;
  }

  /** If the number of documents to be visited in the
   * postings exceeds this specified percentage of the
   * maxDoc() for the index, then {@link
   * #CONSTANT_SCORE_FILTER_REWRITE} is used.
   * @param percent 0.0 to 100.0 */
  public void setDocCountPercent(double percent) {
    docCountPercent = percent;
  }

  /** @see #setDocCountPercent */
  public double getDocCountPercent() {
    return docCountPercent;
  }

  @Override
  protected BooleanQuery getTopLevelQuery() {
    // coord is disabled: all clauses score identically under constant-score
    return new BooleanQuery(true);
  }

  @Override
  protected void addClause(BooleanQuery topLevel, Term term, float boost /*ignored*/) {
    topLevel.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
  }

  @Override
  public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {

    // Get the enum and start visiting terms.  If we
    // exhaust the enum before hitting either of the
    // cutoffs, we use ConstantBooleanQueryRewrite; else,
    // ConstantFilterRewrite:
    final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc());
    // Never build a BooleanQuery with more clauses than the global limit allows:
    final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff);

    final CutOffTermCollector col = new CutOffTermCollector(reader, docCountCutoff, termCountLimit);
    collectTerms(reader, query, col);

    if (col.hasCutOff) {
      // Too many terms or too many postings: delegate to the filter rewrite.
      return MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
    } else {
      final Query result;
      if (col.pendingTerms.isEmpty()) {
        result = getTopLevelQuery();
      } else {
        BooleanQuery bq = getTopLevelQuery();
        for(Term term : col.pendingTerms) {
          addClause(bq, term, 1.0f);
        }
        // Strip scores
        result = new ConstantScoreQuery(bq);
        result.setBoost(query.getBoost());
      }
      query.incTotalNumberOfTerms(col.pendingTerms.size());
      return result;
    }
  }

  /**
   * Accumulates terms until either the term-count limit or the
   * estimated document-visit cutoff is exceeded, then aborts
   * collection with {@code hasCutOff} set.
   */
  private static final class CutOffTermCollector implements TermCollector {
    CutOffTermCollector(IndexReader reader, int docCountCutoff, int termCountLimit) {
      this.reader = reader;
      this.docCountCutoff = docCountCutoff;
      this.termCountLimit = termCountLimit;
    }

    public boolean collect(Term t, float boost) throws IOException {
      pendingTerms.add(t);
      // Loading the TermInfo from the terms dict here
      // should not be costly, because 1) the
      // query/filter will load the TermInfo when it
      // runs, and 2) the terms dict has a cache:
      docVisitCount += reader.docFreq(t);
      if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
        hasCutOff = true;
        return false;  // stop collecting further terms
      }
      return true;
    }

    int docVisitCount = 0;
    boolean hasCutOff = false;

    final IndexReader reader;
    final int docCountCutoff, termCountLimit;
    final ArrayList<Term> pendingTerms = new ArrayList<Term>();
  }

  @Override
  public int hashCode() {
    final int prime = 1279;
    return (int) (prime * termCountCutoff + Double.doubleToLongBits(docCountPercent));
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj)
      return true;
    if (obj == null)
      return false;
    if (getClass() != obj.getClass())
      return false;

    ConstantScoreAutoRewrite other = (ConstantScoreAutoRewrite) obj;
    if (other.termCountCutoff != termCountCutoff) {
      return false;
    }

    // Compare doubles by bit pattern so NaN == NaN, consistent with hashCode
    if (Double.doubleToLongBits(other.docCountPercent) != Double.doubleToLongBits(docCountPercent)) {
      return false;
    }

    return true;
  }
}