lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/TermInfosReaderIndex.java

   1 package org.apache.lucene.index;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.util.ArrayList;
  22 import java.util.Comparator;
  23 import java.util.List;
  24
  25 import org.apache.lucene.util.BitUtil;
  26 import org.apache.lucene.util.BytesRef;
  27 import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
  28 import org.apache.lucene.util.PagedBytes.PagedBytesDataOutput;
  29 import org.apache.lucene.util.PagedBytes;
  30 import org.apache.lucene.util.packed.GrowableWriter;
  31 import org.apache.lucene.util.packed.PackedInts;
  32
  33 /**
  34  * This stores a monotonically increasing set of <Term, TermInfo> pairs in an
  35  * index segment. Pairs are accessed either by Term or by ordinal position the
  36  * set. The Terms and TermInfo are actually serialized and stored into a byte
  37  * array and pointers to the position of each are stored in a int array.
  38  */
  39 class TermInfosReaderIndex {
  40
  41   private static final int MAX_PAGE_BITS = 18; // 256 KB block
  42   private Term[] fields;
  43   private int totalIndexInterval;
  44   private Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUTF16Comparator();
  45   private final PagedBytesDataInput dataInput;
  46   private final PackedInts.Reader indexToDataOffset;
  47   private final int indexSize;
  48   private final int skipInterval;
  49
  50   /**
  51    * Loads the segment information at segment load time.
  52    *
  53    * @param indexEnum
  54    *          the term enum.
  55    * @param indexDivisor
  56    *          the index divisor.
  57    * @param tiiFileLength
  58    *          the size of the tii file, used to approximate the size of the
  59    *          buffer.
  60    * @param totalIndexInterval
  61    *          the total index interval.
  62    */
  63   TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval) throws IOException {
  64     this.totalIndexInterval = totalIndexInterval;
  65     indexSize = 1 + ((int) indexEnum.size - 1) / indexDivisor;
  66     skipInterval = indexEnum.skipInterval;
  67     // this is only an inital size, it will be GCed once the build is complete
  68     long initialSize = (long) (tiiFileLength * 1.5) / indexDivisor;
  69     PagedBytes dataPagedBytes = new PagedBytes(estimatePageBits(initialSize));
  70     PagedBytesDataOutput dataOutput = dataPagedBytes.getDataOutput();
  71
  72     GrowableWriter indexToTerms = new GrowableWriter(4, indexSize, false);
  73     String currentField = null;
  74     List<String> fieldStrs = new ArrayList<String>();
  75     int fieldCounter = -1;
  76     for (int i = 0; indexEnum.next(); i++) {
  77       Term term = indexEnum.term();
  78       if (currentField != term.field) {
  79         currentField = term.field;
  80         fieldStrs.add(currentField);
  81         fieldCounter++;
  82       }
  83       TermInfo termInfo = indexEnum.termInfo();
  84       indexToTerms.set(i, dataOutput.getPosition());
  85       dataOutput.writeVInt(fieldCounter);
  86       dataOutput.writeString(term.text());
  87       dataOutput.writeVInt(termInfo.docFreq);
  88       if (termInfo.docFreq >= skipInterval) {
  89         dataOutput.writeVInt(termInfo.skipOffset);
  90       }
  91       dataOutput.writeVLong(termInfo.freqPointer);
  92       dataOutput.writeVLong(termInfo.proxPointer);
  93       dataOutput.writeVLong(indexEnum.indexPointer);
  94       for (int j = 1; j < indexDivisor; j++) {
  95         if (!indexEnum.next()) {
  96           break;
  97         }
  98       }
  99     }
 100
 101     fields = new Term[fieldStrs.size()];
 102     for (int i = 0; i < fields.length; i++) {
 103       fields[i] = new Term(fieldStrs.get(i));
 104     }
 105
 106     dataPagedBytes.freeze(true);
 107     dataInput = dataPagedBytes.getDataInput();
 108     indexToDataOffset = indexToTerms.getMutable();
 109   }
 110
 111   private static int estimatePageBits(long estSize) {
 112     return Math.max(Math.min(64 - BitUtil.nlz(estSize), MAX_PAGE_BITS), 4);
 113   }
 114
 115   void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
 116     PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
 117
 118     input.setPosition(indexToDataOffset.get(indexOffset));
 119
 120     // read the term
 121     int fieldId = input.readVInt();
 122     Term field = fields[fieldId];
 123     Term term = field.createTerm(input.readString());
 124
 125     // read the terminfo
 126     TermInfo termInfo = new TermInfo();
 127     termInfo.docFreq = input.readVInt();
 128     if (termInfo.docFreq >= skipInterval) {
 129       termInfo.skipOffset = input.readVInt();
 130     } else {
 131       termInfo.skipOffset = 0;
 132     }
 133     termInfo.freqPointer = input.readVLong();
 134     termInfo.proxPointer = input.readVLong();
 135
 136     long pointer = input.readVLong();
 137
 138     // perform the seek
 139     enumerator.seek(pointer, ((long) indexOffset * totalIndexInterval) - 1, term, termInfo);
 140   }
 141
 142   /**
 143    * Binary search for the given term.
 144    *
 145    * @param term
 146    *          the term to locate.
 147    * @throws IOException
 148    */
 149   int getIndexOffset(Term term, BytesRef termBytesRef) throws IOException {
 150     int lo = 0;
 151     int hi = indexSize - 1;
 152     PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
 153     BytesRef scratch = new BytesRef();
 154     while (hi >= lo) {
 155       int mid = (lo + hi) >>> 1;
 156       int delta = compareTo(term, termBytesRef, mid, input, scratch);
 157       if (delta < 0)
 158         hi = mid - 1;
 159       else if (delta > 0)
 160         lo = mid + 1;
 161       else
 162         return mid;
 163     }
 164     return hi;
 165   }
 166
 167   /**
 168    * Gets the term at the given position.  For testing.
 169    *
 170    * @param termIndex
 171    *          the position to read the term from the index.
 172    * @return the term.
 173    * @throws IOException
 174    */
 175   Term getTerm(int termIndex) throws IOException {
 176     PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
 177     input.setPosition(indexToDataOffset.get(termIndex));
 178
 179     // read the term
 180     int fieldId = input.readVInt();
 181     Term field = fields[fieldId];
 182     return field.createTerm(input.readString());
 183   }
 184
 185   /**
 186    * Returns the number of terms.
 187    *
 188    * @return int.
 189    */
 190   int length() {
 191     return indexSize;
 192   }
 193
 194   /**
 195    * The compares the given term against the term in the index specified by the
 196    * term index. ie It returns negative N when term is less than index term;
 197    *
 198    * @param term
 199    *          the given term.
 200    * @param termIndex
 201    *          the index of the of term to compare.
 202    * @return int.
 203    * @throws IOException
 204    */
 205   int compareTo(Term term, BytesRef termBytesRef, int termIndex) throws IOException {
 206     return compareTo(term, termBytesRef, termIndex, (PagedBytesDataInput) dataInput.clone(), new BytesRef());
 207   }
 208
 209   /**
 210    * Compare the fields of the terms first, and if not equals return from
 211    * compare. If equal compare terms.
 212    *
 213    * @param term
 214    *          the term to compare.
 215    * @param termIndex
 216    *          the position of the term in the input to compare
 217    * @param input
 218    *          the input buffer.
 219    * @return int.
 220    * @throws IOException
 221    */
 222   private int compareTo(Term term, BytesRef termBytesRef, int termIndex, PagedBytesDataInput input, BytesRef reuse) throws IOException {
 223     // if term field does not equal mid's field index, then compare fields
 224     // else if they are equal, compare term's string values...
 225     int c = compareField(term, termIndex, input);
 226     if (c == 0) {
 227       reuse.length = input.readVInt();
 228       reuse.grow(reuse.length);
 229       input.readBytes(reuse.bytes, 0, reuse.length);
 230       return comparator.compare(termBytesRef, reuse);
 231     }
 232     return c;
 233   }
 234
 235   /**
 236    * Compares the fields before checking the text of the terms.
 237    *
 238    * @param term
 239    *          the given term.
 240    * @param termIndex
 241    *          the term that exists in the data block.
 242    * @param input
 243    *          the data block.
 244    * @return int.
 245    * @throws IOException
 246    */
 247   private int compareField(Term term, int termIndex, PagedBytesDataInput input) throws IOException {
 248     input.setPosition(indexToDataOffset.get(termIndex));
 249     return term.field.compareTo(fields[input.readVInt()].field);
 250   }
 251 }