1 package org.apache.lucene.search;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
22 import org.apache.lucene.index.IndexReader;
23 import org.apache.lucene.util.FixedBitSet;
24 import org.apache.lucene.index.TermDocs; // for javadocs
27 * A {@link Filter} that only accepts documents whose single
28 * term value in the specified field is contained in the
29 * provided set of allowed terms.
33 * This is the same functionality as TermsFilter (from
34 * contrib/queries), except this filter requires that the
35 * field contains only a single term for all documents.
36 * Because of drastically different implementations, they
37 * also have different performance characteristics, as
42 * The first invocation of this filter on a given field will
43 * be slower, since a {@link FieldCache.StringIndex} must be
44 * created. Subsequent invocations using the same field
45 * will re-use this cache. However, as with all
46 * functionality based on {@link FieldCache}, persistent RAM
47 * is consumed to hold the cache, and is not freed until the
48 * {@link IndexReader} is closed. In contrast, TermsFilter
49 * has no persistent RAM consumption.
54 * With each search, this filter translates the specified
55 * set of Terms into a private {@link FixedBitSet} keyed by
56 * term number per unique {@link IndexReader} (normally one
57 * reader per segment). Then, during matching, the term
58 * number for each docID is retrieved from the cache and
59 * then checked for inclusion using the {@link FixedBitSet}.
60 * Since all testing is done using RAM resident data
61 * structures, performance should be very fast, most likely
62 * fast enough to not require further caching of the
63 * DocIdSet for each possible combination of terms.
64 * However, because docIDs are simply scanned linearly, an
65 * index with a great many small documents may find this
66 * linear scan too costly.
70 * In contrast, TermsFilter builds up an {@link FixedBitSet},
71 * keyed by docID, every time it's created, by enumerating
72 * through all matching docs using {@link TermDocs} to seek
73 * and scan through each term's docID list. While there is
74 * no linear scan of all docIDs, besides the allocation of
75 * the underlying array in the {@link FixedBitSet}, this
76 * approach requires a number of "disk seeks" in proportion
77 * to the number of terms, which can be exceptionally costly
78 * when there are cache misses in the OS's IO cache.
82 * Generally, this filter will be slower on the first
83 * invocation for a given field, but subsequent invocations,
84 * even if you change the allowed set of Terms, should be
85 * faster than TermsFilter, especially as the number of
86 * Terms being matched increases. If you are matching only
87 * a very small number of terms, and those terms in turn
88 * match a very small number of documents, TermsFilter may
93 * Which filter is best is very application dependent.
96 public class FieldCacheTermsFilter extends Filter {
98 private String[] terms;
100 public FieldCacheTermsFilter(String field, String... terms) {
105 public FieldCache getFieldCache() {
106 return FieldCache.DEFAULT;
110 public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
111 return new FieldCacheTermsFilterDocIdSet(getFieldCache().getStringIndex(reader, field));
114 protected class FieldCacheTermsFilterDocIdSet extends DocIdSet {
115 private FieldCache.StringIndex fcsi;
117 private FixedBitSet bits;
119 public FieldCacheTermsFilterDocIdSet(FieldCache.StringIndex fcsi) {
121 bits = new FixedBitSet(this.fcsi.lookup.length);
122 for (int i=0;i<terms.length;i++) {
123 int termNumber = this.fcsi.binarySearchLookup(terms[i]);
124 if (termNumber > 0) {
125 bits.set(termNumber);
131 public DocIdSetIterator iterator() {
132 return new FieldCacheTermsFilterDocIdSetIterator();
135 /** This DocIdSet implementation is cacheable. */
137 public boolean isCacheable() {
141 protected class FieldCacheTermsFilterDocIdSetIterator extends DocIdSetIterator {
142 private int doc = -1;
150 public int nextDoc() {
152 while (!bits.get(fcsi.order[++doc])) {}
153 } catch (ArrayIndexOutOfBoundsException e) {
160 public int advance(int target) {
163 while (!bits.get(fcsi.order[doc])) {
166 } catch (ArrayIndexOutOfBoundsException e) {