// Recovered from a unified diff (new file, blob 4786bb5) of
// lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/TermInfosReaderIndex.java
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
import org.apache.lucene.util.PagedBytes.PagedBytesDataOutput;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;

/**
 * This stores a monotonically increasing set of (Term, TermInfo) pairs in an
 * index segment. Pairs are accessed either by Term or by ordinal position in
 * the set. The Terms and TermInfos are serialized into a paged byte buffer,
 * and the start offset of each serialized pair is stored in a packed-ints
 * array indexed by ordinal.
 *
 * <p>Serialized layout of one entry, in write order: field ordinal (VInt),
 * term text (String), docFreq (VInt), skipOffset (VInt, only when
 * {@code docFreq >= skipInterval}), freqPointer (VLong), proxPointer (VLong),
 * index pointer (VLong).
 */
class TermInfosReaderIndex {

  private static final int MAX_PAGE_BITS = 18; // 256 KB block

  /** One text-less Term per distinct field, in order of first appearance. */
  private final Term[] fields;
  private final int totalIndexInterval;
  private final Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUTF16Comparator();
  private final PagedBytesDataInput dataInput;
  /** Maps an entry ordinal to the byte offset of its serialized form. */
  private final PackedInts.Reader indexToDataOffset;
  private final int indexSize;
  private final int skipInterval;

  /**
   * Loads the segment information at segment load time.
   *
   * @param indexEnum
   *          the term enum.
   * @param indexDivisor
   *          the index divisor; only every {@code indexDivisor}-th term of
   *          the enum is kept.
   * @param tiiFileLength
   *          the size of the tii file, used to approximate the size of the
   *          buffer.
   * @param totalIndexInterval
   *          the total index interval.
   * @throws IOException
   *           if reading from the term enum or writing to the buffer fails.
   */
  TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval) throws IOException {
    this.totalIndexInterval = totalIndexInterval;
    indexSize = 1 + ((int) indexEnum.size - 1) / indexDivisor;
    skipInterval = indexEnum.skipInterval;
    // this is only an initial size, it will be GCed once the build is complete
    long initialSize = (long) (tiiFileLength * 1.5) / indexDivisor;
    PagedBytes dataPagedBytes = new PagedBytes(estimatePageBits(initialSize));
    PagedBytesDataOutput dataOutput = dataPagedBytes.getDataOutput();

    GrowableWriter indexToTerms = new GrowableWriter(4, indexSize, false);
    String currentField = null;
    List<String> fieldStrs = new ArrayList<String>();
    int fieldCounter = -1;
    for (int i = 0; indexEnum.next(); i++) {
      Term term = indexEnum.term();
      // Terms arrive grouped by field, so a change of field name starts a new
      // field ordinal. equals() gives the same answer as an identity check
      // when names are interned and stays correct when they are not.
      if (!term.field.equals(currentField)) {
        currentField = term.field;
        fieldStrs.add(currentField);
        fieldCounter++;
      }
      TermInfo termInfo = indexEnum.termInfo();
      indexToTerms.set(i, dataOutput.getPosition());
      dataOutput.writeVInt(fieldCounter);
      dataOutput.writeString(term.text());
      dataOutput.writeVInt(termInfo.docFreq);
      // skipOffset is only stored when present; readers apply the same test.
      if (termInfo.docFreq >= skipInterval) {
        dataOutput.writeVInt(termInfo.skipOffset);
      }
      dataOutput.writeVLong(termInfo.freqPointer);
      dataOutput.writeVLong(termInfo.proxPointer);
      dataOutput.writeVLong(indexEnum.indexPointer);
      // Skip the next (indexDivisor - 1) terms so only every
      // indexDivisor-th term is retained.
      for (int j = 1; j < indexDivisor; j++) {
        if (!indexEnum.next()) {
          break;
        }
      }
    }

    fields = new Term[fieldStrs.size()];
    for (int i = 0; i < fields.length; i++) {
      fields[i] = new Term(fieldStrs.get(i));
    }

    dataPagedBytes.freeze(true);
    dataInput = dataPagedBytes.getDataInput();
    indexToDataOffset = indexToTerms.getMutable();
  }

  /**
   * Picks a page size (as a bit count) for the data buffer: the number of
   * significant bits in {@code estSize}, clamped to the range
   * [4, {@link #MAX_PAGE_BITS}].
   */
  private static int estimatePageBits(long estSize) {
    return Math.max(Math.min(64 - BitUtil.nlz(estSize), MAX_PAGE_BITS), 4);
  }

  /**
   * Positions the given enumerator at the index entry with the given ordinal.
   *
   * @param enumerator
   *          the enumerator to seek.
   * @param indexOffset
   *          the ordinal of the index entry to seek to.
   * @throws IOException
   *           if reading the entry or seeking the enumerator fails.
   */
  void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();

    input.setPosition(indexToDataOffset.get(indexOffset));

    // read the term
    Term term = readTerm(input);

    // read the terminfo
    TermInfo termInfo = new TermInfo();
    termInfo.docFreq = input.readVInt();
    if (termInfo.docFreq >= skipInterval) {
      termInfo.skipOffset = input.readVInt();
    } else {
      termInfo.skipOffset = 0;
    }
    termInfo.freqPointer = input.readVLong();
    termInfo.proxPointer = input.readVLong();

    long pointer = input.readVLong();

    // perform the seek; the multiplication is done in long to avoid int
    // overflow. NOTE(review): the "- 1" presumably leaves the enumerator one
    // position before the indexed term so that next() lands on it -- confirm
    // against SegmentTermEnum.seek.
    enumerator.seek(pointer, ((long) indexOffset * totalIndexInterval) - 1, term, termInfo);
  }

  /**
   * Binary search for the given term.
   *
   * @param term
   *          the term to locate.
   * @param termBytesRef
   *          the bytes of the term's text, compared against the stored term
   *          bytes when the fields are equal.
   * @return the ordinal of the matching index entry, or, when there is no
   *         exact match, the ordinal of the greatest entry that sorts before
   *         the term; this is {@code -1} when the term sorts before every
   *         entry.
   * @throws IOException
   *           if reading from the data buffer fails.
   */
  int getIndexOffset(Term term, BytesRef termBytesRef) throws IOException {
    int lo = 0;
    int hi = indexSize - 1;
    // One cloned input and one scratch buffer are reused for every probe.
    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
    BytesRef scratch = new BytesRef();
    while (hi >= lo) {
      int mid = (lo + hi) >>> 1;
      int delta = compareTo(term, termBytesRef, mid, input, scratch);
      if (delta < 0) {
        hi = mid - 1;
      } else if (delta > 0) {
        lo = mid + 1;
      } else {
        return mid;
      }
    }
    return hi;
  }

  /**
   * Gets the term at the given position. For testing.
   *
   * @param termIndex
   *          the position to read the term from the index.
   * @return the term.
   * @throws IOException
   *           if reading from the data buffer fails.
   */
  Term getTerm(int termIndex) throws IOException {
    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
    input.setPosition(indexToDataOffset.get(termIndex));

    // read the term
    return readTerm(input);
  }

  /**
   * Returns the number of terms.
   *
   * @return int.
   */
  int length() {
    return indexSize;
  }

  /**
   * Compares the given term against the term in the index specified by the
   * term index, i.e. it returns a negative number when the given term is less
   * than the index term.
   *
   * @param term
   *          the given term.
   * @param termBytesRef
   *          the bytes of the given term's text.
   * @param termIndex
   *          the index of the term to compare.
   * @return negative, zero or positive as the given term is less than, equal
   *         to or greater than the index term.
   * @throws IOException
   *           if reading from the data buffer fails.
   */
  int compareTo(Term term, BytesRef termBytesRef, int termIndex) throws IOException {
    return compareTo(term, termBytesRef, termIndex, (PagedBytesDataInput) dataInput.clone(), new BytesRef());
  }

  /**
   * Compare the fields of the terms first, and if not equal return from
   * compare. If equal compare the term texts.
   *
   * @param term
   *          the term to compare.
   * @param termBytesRef
   *          the bytes of the given term's text.
   * @param termIndex
   *          the position of the term in the input to compare
   * @param input
   *          the input buffer; its position is changed by this call.
   * @param reuse
   *          scratch buffer that receives the stored term bytes.
   * @return negative, zero or positive as the given term is less than, equal
   *         to or greater than the index term.
   * @throws IOException
   *           if reading from the data buffer fails.
   */
  private int compareTo(Term term, BytesRef termBytesRef, int termIndex, PagedBytesDataInput input, BytesRef reuse) throws IOException {
    // if term field does not equal mid's field index, then compare fields
    // else if they are equal, compare term's string values...
    int c = compareField(term, termIndex, input);
    if (c == 0) {
      // compareField left the input just after the field ordinal, i.e. at the
      // length prefix of the stored term text.
      reuse.length = input.readVInt();
      reuse.grow(reuse.length);
      input.readBytes(reuse.bytes, 0, reuse.length);
      return comparator.compare(termBytesRef, reuse);
    }
    return c;
  }

  /**
   * Compares the fields before checking the text of the terms.
   *
   * @param term
   *          the given term.
   * @param termIndex
   *          the term that exists in the data block.
   * @param input
   *          the data block; positioned at the entry and advanced past its
   *          field ordinal by this call.
   * @return the result of comparing the given term's field name with the
   *         index term's field name.
   * @throws IOException
   *           if reading from the data buffer fails.
   */
  private int compareField(Term term, int termIndex, PagedBytesDataInput input) throws IOException {
    input.setPosition(indexToDataOffset.get(termIndex));
    return term.field.compareTo(fields[input.readVInt()].field);
  }

  /**
   * Decodes the field ordinal and term text at the input's current position
   * and returns the corresponding Term.
   */
  private Term readTerm(PagedBytesDataInput input) throws IOException {
    int fieldId = input.readVInt();
    Term field = fields[fieldId];
    return field.createTerm(input.readString());
  }
}