1 package org.apache.lucene.index;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Comparator;
23 import java.util.List;
25 import org.apache.lucene.util.BitUtil;
26 import org.apache.lucene.util.BytesRef;
27 import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
28 import org.apache.lucene.util.PagedBytes.PagedBytesDataOutput;
29 import org.apache.lucene.util.PagedBytes;
30 import org.apache.lucene.util.packed.GrowableWriter;
31 import org.apache.lucene.util.packed.PackedInts;
34 * This stores a monotonically increasing set of <Term, TermInfo> pairs in an
35 * index segment. Pairs are accessed either by Term or by ordinal position the
36 * set. The Terms and TermInfo are actually serialized and stored into a byte
37 * array and pointers to the position of each are stored in a int array.
39 class TermInfosReaderIndex {
41 private static final int MAX_PAGE_BITS = 18; // 256 KB block
42 private Term[] fields;
43 private int totalIndexInterval;
44 private Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUTF16Comparator();
45 private final PagedBytesDataInput dataInput;
46 private final PackedInts.Reader indexToDataOffset;
47 private final int indexSize;
48 private final int skipInterval;
51 * Loads the segment information at segment load time.
57 * @param tiiFileLength
58 * the size of the tii file, used to approximate the size of the
60 * @param totalIndexInterval
61 * the total index interval.
63 TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval) throws IOException {
64 this.totalIndexInterval = totalIndexInterval;
65 indexSize = 1 + ((int) indexEnum.size - 1) / indexDivisor;
66 skipInterval = indexEnum.skipInterval;
67 // this is only an inital size, it will be GCed once the build is complete
68 long initialSize = (long) (tiiFileLength * 1.5) / indexDivisor;
69 PagedBytes dataPagedBytes = new PagedBytes(estimatePageBits(initialSize));
70 PagedBytesDataOutput dataOutput = dataPagedBytes.getDataOutput();
72 GrowableWriter indexToTerms = new GrowableWriter(4, indexSize, false);
73 String currentField = null;
74 List<String> fieldStrs = new ArrayList<String>();
75 int fieldCounter = -1;
76 for (int i = 0; indexEnum.next(); i++) {
77 Term term = indexEnum.term();
78 if (currentField != term.field) {
79 currentField = term.field;
80 fieldStrs.add(currentField);
83 TermInfo termInfo = indexEnum.termInfo();
84 indexToTerms.set(i, dataOutput.getPosition());
85 dataOutput.writeVInt(fieldCounter);
86 dataOutput.writeString(term.text());
87 dataOutput.writeVInt(termInfo.docFreq);
88 if (termInfo.docFreq >= skipInterval) {
89 dataOutput.writeVInt(termInfo.skipOffset);
91 dataOutput.writeVLong(termInfo.freqPointer);
92 dataOutput.writeVLong(termInfo.proxPointer);
93 dataOutput.writeVLong(indexEnum.indexPointer);
94 for (int j = 1; j < indexDivisor; j++) {
95 if (!indexEnum.next()) {
101 fields = new Term[fieldStrs.size()];
102 for (int i = 0; i < fields.length; i++) {
103 fields[i] = new Term(fieldStrs.get(i));
106 dataPagedBytes.freeze(true);
107 dataInput = dataPagedBytes.getDataInput();
108 indexToDataOffset = indexToTerms.getMutable();
111 private static int estimatePageBits(long estSize) {
112 return Math.max(Math.min(64 - BitUtil.nlz(estSize), MAX_PAGE_BITS), 4);
115 void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
116 PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
118 input.setPosition(indexToDataOffset.get(indexOffset));
121 int fieldId = input.readVInt();
122 Term field = fields[fieldId];
123 Term term = field.createTerm(input.readString());
126 TermInfo termInfo = new TermInfo();
127 termInfo.docFreq = input.readVInt();
128 if (termInfo.docFreq >= skipInterval) {
129 termInfo.skipOffset = input.readVInt();
131 termInfo.skipOffset = 0;
133 termInfo.freqPointer = input.readVLong();
134 termInfo.proxPointer = input.readVLong();
136 long pointer = input.readVLong();
139 enumerator.seek(pointer, ((long) indexOffset * totalIndexInterval) - 1, term, termInfo);
143 * Binary search for the given term.
146 * the term to locate.
147 * @throws IOException
149 int getIndexOffset(Term term, BytesRef termBytesRef) throws IOException {
151 int hi = indexSize - 1;
152 PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
153 BytesRef scratch = new BytesRef();
155 int mid = (lo + hi) >>> 1;
156 int delta = compareTo(term, termBytesRef, mid, input, scratch);
168 * Gets the term at the given position. For testing.
171 * the position to read the term from the index.
173 * @throws IOException
175 Term getTerm(int termIndex) throws IOException {
176 PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
177 input.setPosition(indexToDataOffset.get(termIndex));
180 int fieldId = input.readVInt();
181 Term field = fields[fieldId];
182 return field.createTerm(input.readString());
186 * Returns the number of terms.
195 * The compares the given term against the term in the index specified by the
196 * term index. ie It returns negative N when term is less than index term;
201 * the index of the of term to compare.
203 * @throws IOException
205 int compareTo(Term term, BytesRef termBytesRef, int termIndex) throws IOException {
206 return compareTo(term, termBytesRef, termIndex, (PagedBytesDataInput) dataInput.clone(), new BytesRef());
210 * Compare the fields of the terms first, and if not equals return from
211 * compare. If equal compare terms.
214 * the term to compare.
216 * the position of the term in the input to compare
220 * @throws IOException
222 private int compareTo(Term term, BytesRef termBytesRef, int termIndex, PagedBytesDataInput input, BytesRef reuse) throws IOException {
223 // if term field does not equal mid's field index, then compare fields
224 // else if they are equal, compare term's string values...
225 int c = compareField(term, termIndex, input);
227 reuse.length = input.readVInt();
228 reuse.grow(reuse.length);
229 input.readBytes(reuse.bytes, 0, reuse.length);
230 return comparator.compare(termBytesRef, reuse);
236 * Compares the fields before checking the text of the terms.
241 * the term that exists in the data block.
245 * @throws IOException
247 private int compareField(Term term, int termIndex, PagedBytesDataInput input) throws IOException {
248 input.setPosition(indexToDataOffset.get(termIndex));
249 return term.field.compareTo(fields[input.readVInt()].field);