X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java?ds=sidebyside

diff --git a/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java
new file mode 100644
index 0000000..71e23a9
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java
@@ -0,0 +1,173 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.index.TermDocs; // for javadocs
+
+/**
+ * A {@link Filter} that only accepts documents whose single
+ * term value in the specified field is contained in the
+ * provided set of allowed terms.
+ *
+ * <p/>
+ * + * This is the same functionality as TermsFilter (from + * contrib/queries), except this filter requires that the + * field contains only a single term for all documents. + * Because of drastically different implementations, they + * also have different performance characteristics, as + * described below. + * + *
+ * <p/>
+ * + * The first invocation of this filter on a given field will + * be slower, since a {@link FieldCache.StringIndex} must be + * created. Subsequent invocations using the same field + * will re-use this cache. However, as with all + * functionality based on {@link FieldCache}, persistent RAM + * is consumed to hold the cache, and is not freed until the + * {@link IndexReader} is closed. In contrast, TermsFilter + * has no persistent RAM consumption. + * + * + *
+ * <p/>
+ * + * With each search, this filter translates the specified + * set of Terms into a private {@link FixedBitSet} keyed by + * term number per unique {@link IndexReader} (normally one + * reader per segment). Then, during matching, the term + * number for each docID is retrieved from the cache and + * then checked for inclusion using the {@link FixedBitSet}. + * Since all testing is done using RAM resident data + * structures, performance should be very fast, most likely + * fast enough to not require further caching of the + * DocIdSet for each possible combination of terms. + * However, because docIDs are simply scanned linearly, an + * index with a great many small documents may find this + * linear scan too costly. + * + *
+ * <p/>
+ *
+ * In contrast, TermsFilter builds up a {@link FixedBitSet},
+ * keyed by docID, every time it's created, by enumerating
+ * through all matching docs using {@link TermDocs} to seek
+ * and scan through each term's docID list.  While there is
+ * no linear scan of all docIDs, besides the allocation of
+ * the underlying array in the {@link FixedBitSet}, this
+ * approach requires a number of "disk seeks" in proportion
+ * to the number of terms, which can be exceptionally costly
+ * when there are cache misses in the OS's IO cache.
+ *
+ * <p/>
+ * + * Generally, this filter will be slower on the first + * invocation for a given field, but subsequent invocations, + * even if you change the allowed set of Terms, should be + * faster than TermsFilter, especially as the number of + * Terms being matched increases. If you are matching only + * a very small number of terms, and those terms in turn + * match a very small number of documents, TermsFilter may + * perform faster. + * + *
+ * <p/>
+ *
+ * Which filter is best is very application dependent.
+ */
+
+public class FieldCacheTermsFilter extends Filter {
+  private String field;
+  private String[] terms;
+
+  public FieldCacheTermsFilter(String field, String... terms) {
+    this.field = field;
+    this.terms = terms;
+  }
+
+  public FieldCache getFieldCache() {
+    return FieldCache.DEFAULT;
+  }
+
+  @Override
+  public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
+    return new FieldCacheTermsFilterDocIdSet(getFieldCache().getStringIndex(reader, field));
+  }
+
+  protected class FieldCacheTermsFilterDocIdSet extends DocIdSet {
+    private FieldCache.StringIndex fcsi;
+
+    private FixedBitSet bits;
+
+    public FieldCacheTermsFilterDocIdSet(FieldCache.StringIndex fcsi) {
+      this.fcsi = fcsi;
+      bits = new FixedBitSet(this.fcsi.lookup.length);
+      for (int i=0;i<terms.length;i++) {
+        int termNumber = this.fcsi.binarySearchLookup(terms[i]);
+        if (termNumber > 0) {
+          bits.set(termNumber);
+        }
+      }
+    }
+
+    @Override
+    public DocIdSetIterator iterator() {
+      return new FieldCacheTermsFilterDocIdSetIterator();
+    }
+
+    /** This DocIdSet implementation is cacheable. */
+    @Override
+    public boolean isCacheable() {
+      return true;
+    }
+
+    protected class FieldCacheTermsFilterDocIdSetIterator extends DocIdSetIterator {
+      private int doc = -1;
+
+      @Override
+      public int docID() {
+        return doc;
+      }
+
+      @Override
+      public int nextDoc() {
+        try {
+          while (!bits.get(fcsi.order[++doc])) {}
+        } catch (ArrayIndexOutOfBoundsException e) {
+          doc = NO_MORE_DOCS;
+        }
+        return doc;
+      }
+
+      @Override
+      public int advance(int target) {
+        try {
+          doc = target;
+          while (!bits.get(fcsi.order[doc])) {
+            doc++;
+          }
+        } catch (ArrayIndexOutOfBoundsException e) {
+          doc = NO_MORE_DOCS;
+        }
+        return doc;
+      }
+    }
+  }
+}
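
For reference, a minimal usage sketch of the new filter against a Lucene 3.5 index could look like the code below. It is illustrative only and not part of the patch: the FieldCacheTermsFilterDemo class name, the in-memory RAMDirectory, the WhitespaceAnalyzer, and the single-valued "category" field are assumptions made for the example. The one real requirement, per the javadoc above, is that the filtered field holds exactly one un-analyzed term per document.

// Illustrative sketch, not part of the patch; assumes a "category" field
// holding a single un-analyzed term per document, as the filter requires.
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.FieldCacheTermsFilter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class FieldCacheTermsFilterDemo {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();

    // Index three documents, each with exactly one term in "category".
    IndexWriter writer = new IndexWriter(dir,
        new IndexWriterConfig(Version.LUCENE_35, new WhitespaceAnalyzer(Version.LUCENE_35)));
    for (String category : new String[] {"books", "music", "electronics"}) {
      Document doc = new Document();
      doc.add(new Field("category", category, Field.Store.YES, Field.Index.NOT_ANALYZED));
      writer.addDocument(doc);
    }
    writer.close();

    // Keep only documents whose "category" term is in the allowed set.
    IndexReader reader = IndexReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    FieldCacheTermsFilter filter = new FieldCacheTermsFilter("category", "books", "music");
    TopDocs hits = searcher.search(new MatchAllDocsQuery(), filter, 10);
    System.out.println("matches: " + hits.totalHits); // expected: 2

    searcher.close();
    reader.close();
  }
}

The first search on a given field and reader pays the one-time cost of building the FieldCache.StringIndex; later searches against the same reader, even with a different set of allowed terms, reuse that cache, which is where this filter is expected to outperform TermsFilter.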