lucene-java-3.4.0/lucene/src/java/org/apache/lucene/index/TermInfosReader.java

   1 package org.apache.lucene.index;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.Closeable;
  21 import java.io.IOException;
  22
  23 import org.apache.lucene.store.Directory;
  24 import org.apache.lucene.util.DoubleBarrelLRUCache;
  25 import org.apache.lucene.util.CloseableThreadLocal;
  26
  27 /** This stores a monotonically increasing set of {@code <Term, TermInfo>} pairs in a
  28  * Directory.  Pairs are accessed either by Term or by ordinal position in the
  29  * set.  */
  30
  31 final class TermInfosReader implements Closeable {
  32   private final Directory directory;
  33   private final String segment;
  34   private final FieldInfos fieldInfos;
  35
  36   private final CloseableThreadLocal<ThreadResources> threadResources = new CloseableThreadLocal<ThreadResources>();
  37   private final SegmentTermEnum origEnum;
  38   private final long size;
  39
  40   private final Term[] indexTerms;
  41   private final TermInfo[] indexInfos;
  42   private final long[] indexPointers;
  43
  44   private final int totalIndexInterval;
  45
  46   private final static int DEFAULT_CACHE_SIZE = 1024;
  47
  48   // Just adds term's ord to TermInfo
  49   private final static class TermInfoAndOrd extends TermInfo {
  50     final long termOrd;
  51     public TermInfoAndOrd(TermInfo ti, long termOrd) {
  52       super(ti);
  53       assert termOrd >= 0;
  54       this.termOrd = termOrd;
  55     }
  56   }
  57
  58   private static class CloneableTerm extends DoubleBarrelLRUCache.CloneableKey {
  59     private final Term term;
  60
  61     public CloneableTerm(Term t) {
  62       this.term = new Term(t.field(), t.text());
  63     }
  64
  65     @Override
  66     public Object clone() {
  67       return new CloneableTerm(term);
  68     }
  69
  70     @Override
  71     public boolean equals(Object _other) {
  72       CloneableTerm other = (CloneableTerm) _other;
  73       return term.equals(other.term);
  74     }
  75
  76     @Override
  77     public int hashCode() {
  78       return term.hashCode();
  79     }
  80   }
  81
  82   private final DoubleBarrelLRUCache<CloneableTerm,TermInfoAndOrd> termsCache = new DoubleBarrelLRUCache<CloneableTerm,TermInfoAndOrd>(DEFAULT_CACHE_SIZE);
  83
  84   /**
  85    * Per-thread resources managed by ThreadLocal
  86    */
  87   private static final class ThreadResources {
  88     SegmentTermEnum termEnum;
  89   }
  90
  91   TermInfosReader(Directory dir, String seg, FieldInfos fis, int readBufferSize, int indexDivisor)
  92        throws CorruptIndexException, IOException {
  93     boolean success = false;
  94
  95     if (indexDivisor < 1 && indexDivisor != -1) {
  96       throw new IllegalArgumentException("indexDivisor must be -1 (don't load terms index) or greater than 0: got " + indexDivisor);
  97     }
  98
  99     try {
 100       directory = dir;
 101       segment = seg;
 102       fieldInfos = fis;
 103
 104       origEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_EXTENSION),
 105           readBufferSize), fieldInfos, false);
 106       size = origEnum.size;
 107
 108
 109       if (indexDivisor != -1) {
 110         // Load terms index
 111         totalIndexInterval = origEnum.indexInterval * indexDivisor;
 112         final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_INDEX_EXTENSION),
 113                                                                                   readBufferSize), fieldInfos, true);
 114
 115         try {
 116           int indexSize = 1+((int)indexEnum.size-1)/indexDivisor;  // otherwise read index
 117
 118           indexTerms = new Term[indexSize];
 119           indexInfos = new TermInfo[indexSize];
 120           indexPointers = new long[indexSize];
 121
 122           for (int i = 0; indexEnum.next(); i++) {
 123             indexTerms[i] = indexEnum.term();
 124             indexInfos[i] = indexEnum.termInfo();
 125             indexPointers[i] = indexEnum.indexPointer;
 126
 127             for (int j = 1; j < indexDivisor; j++)
 128               if (!indexEnum.next())
 129                 break;
 130           }
 131         } finally {
 132           indexEnum.close();
 133         }
 134       } else {
 135         // Do not load terms index:
 136         totalIndexInterval = -1;
 137         indexTerms = null;
 138         indexInfos = null;
 139         indexPointers = null;
 140       }
 141       success = true;
 142     } finally {
 143       // With lock-less commits, it's entirely possible (and
 144       // fine) to hit a FileNotFound exception above. In
 145       // this case, we want to explicitly close any subset
 146       // of things that were opened so that we don't have to
 147       // wait for a GC to do so.
 148       if (!success) {
 149         close();
 150       }
 151     }
 152   }
 153
 154   public int getSkipInterval() {
 155     return origEnum.skipInterval;
 156   }
 157
 158   public int getMaxSkipLevels() {
 159     return origEnum.maxSkipLevels;
 160   }
 161
 162   public final void close() throws IOException {
 163     if (origEnum != null)
 164       origEnum.close();
 165     threadResources.close();
 166   }
 167
 168   /** Returns the number of term/value pairs in the set. */
 169   final long size() {
 170     return size;
 171   }
 172
 173   private ThreadResources getThreadResources() {
 174     ThreadResources resources = threadResources.get();
 175     if (resources == null) {
 176       resources = new ThreadResources();
 177       resources.termEnum = terms();
 178       threadResources.set(resources);
 179     }
 180     return resources;
 181   }
 182
 183
 184   /** Returns the offset of the greatest index entry which is less than or equal to term.*/
 185   private final int getIndexOffset(Term term) {
 186     int lo = 0;                                   // binary search indexTerms[]
 187     int hi = indexTerms.length - 1;
 188
 189     while (hi >= lo) {
 190       int mid = (lo + hi) >>> 1;
 191       int delta = term.compareTo(indexTerms[mid]);
 192       if (delta < 0)
 193         hi = mid - 1;
 194       else if (delta > 0)
 195         lo = mid + 1;
 196       else
 197         return mid;
 198     }
 199     return hi;
 200   }
 201
 202   private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
 203     enumerator.seek(indexPointers[indexOffset],
 204                    ((long) indexOffset * totalIndexInterval) - 1,
 205                    indexTerms[indexOffset], indexInfos[indexOffset]);
 206   }
 207
 208   /** Returns the TermInfo for a Term in the set, or null. */
 209   TermInfo get(Term term) throws IOException {
 210     return get(term, false);
 211   }
 212
 213   /** Returns the TermInfo for a Term in the set, or null. */
 214   private TermInfo get(Term term, boolean mustSeekEnum) throws IOException {
 215     if (size == 0) return null;
 216
 217     ensureIndexIsRead();
 218
 219     final CloneableTerm cacheKey = new CloneableTerm(term);
 220
 221     TermInfoAndOrd tiOrd = termsCache.get(cacheKey);
 222     ThreadResources resources = getThreadResources();
 223
 224     if (!mustSeekEnum && tiOrd != null) {
 225       return tiOrd;
 226     }
 227
 228     // optimize sequential access: first try scanning cached enum w/o seeking
 229     SegmentTermEnum enumerator = resources.termEnum;
 230     if (enumerator.term() != null                 // term is at or past current
 231         && ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0)
 232             || term.compareTo(enumerator.term()) >= 0)) {
 233       int enumOffset = (int)(enumerator.position/totalIndexInterval)+1;
 234       if (indexTerms.length == enumOffset         // but before end of block
 235     || term.compareTo(indexTerms[enumOffset]) < 0) {
 236        // no need to seek
 237
 238         final TermInfo ti;
 239
 240         int numScans = enumerator.scanTo(term);
 241         if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
 242           ti = enumerator.termInfo();
 243           if (numScans > 1) {
 244             // we only  want to put this TermInfo into the cache if
 245             // scanEnum skipped more than one dictionary entry.
 246             // This prevents RangeQueries or WildcardQueries to
 247             // wipe out the cache when they iterate over a large numbers
 248             // of terms in order
 249             if (tiOrd == null) {
 250               termsCache.put(cacheKey, new TermInfoAndOrd(ti, enumerator.position));
 251             } else {
 252               assert sameTermInfo(ti, tiOrd, enumerator);
 253               assert (int) enumerator.position == tiOrd.termOrd;
 254             }
 255           }
 256         } else {
 257           ti = null;
 258         }
 259
 260         return ti;
 261       }
 262     }
 263
 264     // random-access: must seek
 265     final int indexPos;
 266     if (tiOrd != null) {
 267       indexPos = (int) (tiOrd.termOrd / totalIndexInterval);
 268     } else {
 269       // Must do binary search:
 270       indexPos = getIndexOffset(term);
 271     }
 272
 273     seekEnum(enumerator, indexPos);
 274     enumerator.scanTo(term);
 275     final TermInfo ti;
 276     if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
 277       ti = enumerator.termInfo();
 278       if (tiOrd == null) {
 279         // LUCENE-3183: it's possible, if term is Term("",
 280         // ""), for the STE to be incorrectly un-positioned
 281         // after scan-to; work around this by not caching in
 282         // this case:
 283         if (enumerator.position >= 0) {
 284           termsCache.put(cacheKey, new TermInfoAndOrd(ti, enumerator.position));
 285         }
 286       } else {
 287         assert sameTermInfo(ti, tiOrd, enumerator);
 288         assert enumerator.position == tiOrd.termOrd;
 289       }
 290     } else {
 291       ti = null;
 292     }
 293     return ti;
 294   }
 295
 296   // called only from asserts
 297   private final boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) {
 298     if (ti1.docFreq != ti2.docFreq) {
 299       return false;
 300     }
 301     if (ti1.freqPointer != ti2.freqPointer) {
 302       return false;
 303     }
 304     if (ti1.proxPointer != ti2.proxPointer) {
 305       return false;
 306     }
 307     // skipOffset is only valid when docFreq >= skipInterval:
 308     if (ti1.docFreq >= enumerator.skipInterval &&
 309         ti1.skipOffset != ti2.skipOffset) {
 310       return false;
 311     }
 312     return true;
 313   }
 314
 315   private void ensureIndexIsRead() {
 316     if (indexTerms == null) {
 317       throw new IllegalStateException("terms index was not loaded when this reader was created");
 318     }
 319   }
 320
 321   /** Returns the position of a Term in the set or -1. */
 322   final long getPosition(Term term) throws IOException {
 323     if (size == 0) return -1;
 324
 325     ensureIndexIsRead();
 326     int indexOffset = getIndexOffset(term);
 327
 328     SegmentTermEnum enumerator = getThreadResources().termEnum;
 329     seekEnum(enumerator, indexOffset);
 330
 331     while(term.compareTo(enumerator.term()) > 0 && enumerator.next()) {}
 332
 333     if (term.compareTo(enumerator.term()) == 0)
 334       return enumerator.position;
 335     else
 336       return -1;
 337   }
 338
 339   /** Returns an enumeration of all the Terms and TermInfos in the set. */
 340   public SegmentTermEnum terms() {
 341     return (SegmentTermEnum)origEnum.clone();
 342   }
 343
 344   /** Returns an enumeration of terms starting at or after the named term. */
 345   public SegmentTermEnum terms(Term term) throws IOException {
 346     get(term, true);
 347     return (SegmentTermEnum)getThreadResources().termEnum.clone();
 348   }
 349 }