+++ /dev/null
-package org.apache.lucene.search;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.util.FixedBitSet;
-import org.apache.lucene.index.TermDocs; // for javadocs
-
-/**
- * A {@link Filter} that only accepts documents whose single
- * term value in the specified field is contained in the
- * provided set of allowed terms.
- *
- * <p/>
- *
- * This is the same functionality as TermsFilter (from
- * contrib/queries), except this filter requires that the
- * field contains only a single term for all documents.
- * Because of drastically different implementations, they
- * also have different performance characteristics, as
- * described below.
- *
- * <p/>
- *
- * The first invocation of this filter on a given field will
- * be slower, since a {@link FieldCache.StringIndex} must be
- * created. Subsequent invocations using the same field
- * will re-use this cache. However, as with all
- * functionality based on {@link FieldCache}, persistent RAM
- * is consumed to hold the cache, and is not freed until the
- * {@link IndexReader} is closed. In contrast, TermsFilter
- * has no persistent RAM consumption.
- *
- *
- * <p/>
- *
- * With each search, this filter translates the specified
- * set of Terms into a private {@link FixedBitSet} keyed by
- * term number per unique {@link IndexReader} (normally one
- * reader per segment). Then, during matching, the term
- * number for each docID is retrieved from the cache and
- * then checked for inclusion using the {@link FixedBitSet}.
- * Since all testing is done using RAM resident data
- * structures, performance should be very fast, most likely
- * fast enough to not require further caching of the
- * DocIdSet for each possible combination of terms.
- * However, because docIDs are simply scanned linearly, an
- * index with a great many small documents may find this
- * linear scan too costly.
- *
- * <p/>
- *
- * In contrast, TermsFilter builds up a {@link FixedBitSet},
- * keyed by docID, every time it's created, by enumerating
- * through all matching docs using {@link TermDocs} to seek
- * and scan through each term's docID list. While there is
- * no linear scan of all docIDs, besides the allocation of
- * the underlying array in the {@link FixedBitSet}, this
- * approach requires a number of "disk seeks" in proportion
- * to the number of terms, which can be exceptionally costly
- * when there are cache misses in the OS's IO cache.
- *
- * <p/>
- *
- * Generally, this filter will be slower on the first
- * invocation for a given field, but subsequent invocations,
- * even if you change the allowed set of Terms, should be
- * faster than TermsFilter, especially as the number of
- * Terms being matched increases. If you are matching only
- * a very small number of terms, and those terms in turn
- * match a very small number of documents, TermsFilter may
- * perform faster.
- *
- * <p/>
- *
- * Which filter is best is very application dependent.
- */
-
-public class FieldCacheTermsFilter extends Filter {
-  /** Name of the field whose cached per-document term is tested. */
-  private final String field;
-
-  /** Allowed term values; the array is kept as passed in, not copied. */
-  private final String[] terms;
-
-  /**
-   * Creates a filter for the given field and set of allowed terms.
-   *
-   * @param field name of the field to look up in the {@link FieldCache}
-   * @param terms term values a document's field may have in order to be accepted
-   */
-  public FieldCacheTermsFilter(String field, String... terms) {
-    this.field = field;
-    this.terms = terms;
-  }
-
-  /**
-   * Returns the {@link FieldCache} from which the string index is obtained.
-   * This implementation returns {@link FieldCache#DEFAULT}.
-   *
-   * @return the field cache used by {@link #getDocIdSet(IndexReader)}
-   */
-  public FieldCache getFieldCache() {
-    return FieldCache.DEFAULT;
-  }
-
-  /**
-   * Builds the set of accepted documents for one reader from the string
-   * index that {@link #getFieldCache()} supplies for this filter's field.
-   *
-   * @param reader the reader whose documents are filtered
-   * @return a new {@link FieldCacheTermsFilterDocIdSet} for {@code reader}
-   * @throws IOException declared by the overridden method
-   */
-  @Override
-  public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
-    return new FieldCacheTermsFilterDocIdSet(getFieldCache().getStringIndex(reader, field));
-  }
-
-  /**
-   * A {@link DocIdSet} that accepts a document when the term number stored
-   * for it in the string index is flagged in a bit set built from the
-   * allowed terms.
-   */
-  protected class FieldCacheTermsFilterDocIdSet extends DocIdSet {
-    /** String index providing the per-document term numbers ({@code order}). */
-    private final FieldCache.StringIndex fcsi;
-
-    /** One bit per entry of {@code fcsi.lookup}; set for every allowed term number. */
-    private final FixedBitSet bits;
-
-    /**
-     * Translates the allowed terms into term numbers of the given index and
-     * records them in the bit set.
-     *
-     * @param fcsi string index for the reader and field being filtered
-     */
-    public FieldCacheTermsFilterDocIdSet(FieldCache.StringIndex fcsi) {
-      this.fcsi = fcsi;
-      bits = new FixedBitSet(this.fcsi.lookup.length);
-      for (String term : terms) {
-        int termNumber = this.fcsi.binarySearchLookup(term);
-        // Only strictly positive term numbers are flagged.
-        // NOTE(review): presumably a non-positive result means "term absent"
-        // and slot 0 stands for "no term" -- confirm against StringIndex.
-        if (termNumber > 0) {
-          bits.set(termNumber);
-        }
-      }
-    }
-
-    /**
-     * Returns a fresh iterator over the accepted documents.
-     *
-     * @return a new {@link FieldCacheTermsFilterDocIdSetIterator}
-     */
-    @Override
-    public DocIdSetIterator iterator() {
-      return new FieldCacheTermsFilterDocIdSetIterator();
-    }
-
-    /** This DocIdSet implementation is cacheable. */
-    @Override
-    public boolean isCacheable() {
-      return true;
-    }
-
-    /**
-     * Iterator that scans {@code fcsi.order} linearly and stops at each
-     * document whose term number is flagged in the bit set.
-     */
-    protected class FieldCacheTermsFilterDocIdSetIterator extends DocIdSetIterator {
-      /** Current document; {@code -1} until the first call that moves the iterator. */
-      private int doc = -1;
-
-      /**
-       * Returns the current document.
-       *
-       * @return {@code -1} before iteration, the last document returned, or
-       *     {@code NO_MORE_DOCS} once exhausted
-       */
-      @Override
-      public int docID() {
-        return doc;
-      }
-
-      /**
-       * Moves to the next accepted document after the current one.
-       *
-       * @return the new current document, or {@code NO_MORE_DOCS} if none remains
-       */
-      @Override
-      public int nextDoc() {
-        // When doc already equals NO_MORE_DOCS (Integer.MAX_VALUE), doc + 1
-        // wraps to a negative value, which the scan reports as exhausted.
-        doc = firstMatchFrom(doc + 1);
-        return doc;
-      }
-
-      /**
-       * Moves to the first accepted document at or after {@code target}.
-       *
-       * @param target the lowest document number to consider
-       * @return the new current document, or {@code NO_MORE_DOCS} if none remains
-       */
-      @Override
-      public int advance(int target) {
-        doc = firstMatchFrom(target);
-        return doc;
-      }
-
-      /**
-       * Scans forward from {@code from} for the first accepted document,
-       * using explicit bounds checks instead of relying on an
-       * {@link ArrayIndexOutOfBoundsException} to end the scan.
-       *
-       * @param from first document number to test
-       * @return the first accepted document at or after {@code from}, or
-       *     {@code NO_MORE_DOCS} when {@code from} is negative or past the
-       *     end of the index, or when no accepted document remains
-       */
-      private int firstMatchFrom(int from) {
-        final int[] order = fcsi.order;
-        if (from < 0) {
-          // A negative start is treated as exhausted, as it was when the
-          // failed array access was caught.
-          return NO_MORE_DOCS;
-        }
-        for (int candidate = from; candidate < order.length; candidate++) {
-          if (bits.get(order[candidate])) {
-            return candidate;
-          }
-        }
-        return NO_MORE_DOCS;
-      }
-    }
-  }
-}