lucene-java-3.4.0/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java

   1 package org.apache.lucene.search;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21
  22 import org.apache.lucene.index.IndexReader;
  23 import org.apache.lucene.util.FixedBitSet;
  24 import org.apache.lucene.index.TermDocs;  // for javadocs
  25
  26 /**
  27  * A {@link Filter} that only accepts documents whose single
  28  * term value in the specified field is contained in the
  29  * provided set of allowed terms.
  30  *
  31  * <p/>
  32  *
  33  * This is the same functionality as TermsFilter (from
  34  * contrib/queries), except this filter requires that the
  35  * field contains only a single term for all documents.
  36  * Because of drastically different implementations, they
  37  * also have different performance characteristics, as
  38  * described below.
  39  *
  40  * <p/>
  41  *
  42  * The first invocation of this filter on a given field will
  43  * be slower, since a {@link FieldCache.StringIndex} must be
  44  * created.  Subsequent invocations using the same field
  45  * will re-use this cache.  However, as with all
  46  * functionality based on {@link FieldCache}, persistent RAM
  47  * is consumed to hold the cache, and is not freed until the
  48  * {@link IndexReader} is closed.  In contrast, TermsFilter
  49  * has no persistent RAM consumption.
  50  *
  51  *
  52  * <p/>
  53  *
  54  * With each search, this filter translates the specified
  55  * set of Terms into a private {@link FixedBitSet} keyed by
  56  * term number per unique {@link IndexReader} (normally one
  57  * reader per segment).  Then, during matching, the term
  58  * number for each docID is retrieved from the cache and
  59  * then checked for inclusion using the {@link FixedBitSet}.
  60  * Since all testing is done using RAM resident data
  61  * structures, performance should be very fast, most likely
  62  * fast enough to not require further caching of the
  63  * DocIdSet for each possible combination of terms.
  64  * However, because docIDs are simply scanned linearly, an
  65  * index with a great many small documents may find this
  66  * linear scan too costly.
  67  *
  68  * <p/>
  69  *
  70  * In contrast, TermsFilter builds up an {@link FixedBitSet},
  71  * keyed by docID, every time it's created, by enumerating
  72  * through all matching docs using {@link TermDocs} to seek
  73  * and scan through each term's docID list.  While there is
  74  * no linear scan of all docIDs, besides the allocation of
  75  * the underlying array in the {@link FixedBitSet}, this
  76  * approach requires a number of "disk seeks" in proportion
  77  * to the number of terms, which can be exceptionally costly
  78  * when there are cache misses in the OS's IO cache.
  79  *
  80  * <p/>
  81  *
  82  * Generally, this filter will be slower on the first
  83  * invocation for a given field, but subsequent invocations,
  84  * even if you change the allowed set of Terms, should be
  85  * faster than TermsFilter, especially as the number of
  86  * Terms being matched increases.  If you are matching only
  87  * a very small number of terms, and those terms in turn
  88  * match a very small number of documents, TermsFilter may
  89  * perform faster.
  90  *
  91  * <p/>
  92  *
  93  * Which filter is best is very application dependent.
  94  */
  95
  96 public class FieldCacheTermsFilter extends Filter {
  97   private String field;
  98   private String[] terms;
  99
 100   public FieldCacheTermsFilter(String field, String... terms) {
 101     this.field = field;
 102     this.terms = terms;
 103   }
 104
 105   public FieldCache getFieldCache() {
 106     return FieldCache.DEFAULT;
 107   }
 108
 109   @Override
 110   public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
 111     return new FieldCacheTermsFilterDocIdSet(getFieldCache().getStringIndex(reader, field));
 112   }
 113
 114   protected class FieldCacheTermsFilterDocIdSet extends DocIdSet {
 115     private FieldCache.StringIndex fcsi;
 116
 117     private FixedBitSet bits;
 118
 119     public FieldCacheTermsFilterDocIdSet(FieldCache.StringIndex fcsi) {
 120       this.fcsi = fcsi;
 121       bits = new FixedBitSet(this.fcsi.lookup.length);
 122       for (int i=0;i<terms.length;i++) {
 123         int termNumber = this.fcsi.binarySearchLookup(terms[i]);
 124         if (termNumber > 0) {
 125           bits.set(termNumber);
 126         }
 127       }
 128     }
 129
 130     @Override
 131     public DocIdSetIterator iterator() {
 132       return new FieldCacheTermsFilterDocIdSetIterator();
 133     }
 134
 135     /** This DocIdSet implementation is cacheable. */
 136     @Override
 137     public boolean isCacheable() {
 138       return true;
 139     }
 140
 141     protected class FieldCacheTermsFilterDocIdSetIterator extends DocIdSetIterator {
 142       private int doc = -1;
 143
 144       @Override
 145       public int docID() {
 146         return doc;
 147       }
 148
 149       @Override
 150       public int nextDoc() {
 151         try {
 152           while (!bits.get(fcsi.order[++doc])) {}
 153         } catch (ArrayIndexOutOfBoundsException e) {
 154           doc = NO_MORE_DOCS;
 155         }
 156         return doc;
 157       }
 158
 159       @Override
 160       public int advance(int target) {
 161         try {
 162           doc = target;
 163           while (!bits.get(fcsi.order[doc])) {
 164             doc++;
 165           }
 166         } catch (ArrayIndexOutOfBoundsException e) {
 167           doc = NO_MORE_DOCS;
 168         }
 169         return doc;
 170       }
 171     }
 172   }
 173 }