+++ /dev/null
-package org.apache.lucene.index;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.Closeable;
-import java.io.IOException;
-import java.util.Arrays;
-
-import org.apache.lucene.store.BufferedIndexInput;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.IOUtils;
-
-class TermVectorsReader implements Cloneable, Closeable {
-
- // NOTE: if you make a new format, it must be larger than
- // the current format
- static final int FORMAT_VERSION = 2;
-
- // Changes to speed up bulk merging of term vectors:
- static final int FORMAT_VERSION2 = 3;
-
- // Changed strings to UTF8 with length-in-bytes not length-in-chars
- static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4;
-
- // NOTE: always change this if you switch to a new format!
- static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;
-
- //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
- static final int FORMAT_SIZE = 4;
-
- static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
- static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
-
- private FieldInfos fieldInfos;
-
- private IndexInput tvx;
- private IndexInput tvd;
- private IndexInput tvf;
- private int size;
- private int numTotalDocs;
-
- // The docID offset where our docs begin in the index
- // file. This will be 0 if we have our own private file.
- private int docStoreOffset;
-
- private final int format;
-
- TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
- throws CorruptIndexException, IOException {
- this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE);
- }
-
- TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)
- throws CorruptIndexException, IOException {
- this(d, segment, fieldInfos, readBufferSize, -1, 0);
- }
-
- TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
- throws CorruptIndexException, IOException {
- boolean success = false;
-
- try {
- String idxName = IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_INDEX_EXTENSION);
- tvx = d.openInput(idxName, readBufferSize);
- format = checkValidFormat(tvx);
- tvd = d.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_DOCUMENTS_EXTENSION), readBufferSize);
- final int tvdFormat = checkValidFormat(tvd);
- tvf = d.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_FIELDS_EXTENSION), readBufferSize);
- final int tvfFormat = checkValidFormat(tvf);
-
- assert format == tvdFormat;
- assert format == tvfFormat;
-
- if (format >= FORMAT_VERSION2) {
- numTotalDocs = (int) (tvx.length() >> 4);
- } else {
- assert (tvx.length()-FORMAT_SIZE) % 8 == 0;
- numTotalDocs = (int) (tvx.length() >> 3);
- }
-
- if (-1 == docStoreOffset) {
- this.docStoreOffset = 0;
- this.size = numTotalDocs;
- assert size == 0 || numTotalDocs == size;
- } else {
- this.docStoreOffset = docStoreOffset;
- this.size = size;
- // Verify the file is long enough to hold all of our
- // docs
- assert numTotalDocs >= size + docStoreOffset: "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset;
- }
-
- this.fieldInfos = fieldInfos;
- success = true;
- } finally {
- // With lock-less commits, it's entirely possible (and
- // fine) to hit a FileNotFound exception above. In
- // this case, we want to explicitly close any subset
- // of things that were opened so that we don't have to
- // wait for a GC to do so.
- if (!success) {
- close();
- }
- }
- }
-
- // Used for bulk copy when merging
- IndexInput getTvdStream() {
- return tvd;
- }
-
- // Used for bulk copy when merging
- IndexInput getTvfStream() {
- return tvf;
- }
-
- final private void seekTvx(final int docNum) throws IOException {
- if (format < FORMAT_VERSION2)
- tvx.seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE);
- else
- tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
- }
-
- boolean canReadRawDocs() {
- return format >= FORMAT_UTF8_LENGTH_IN_BYTES;
- }
-
- /** Retrieve the length (in bytes) of the tvd and tvf
- * entries for the next numDocs starting with
- * startDocID. This is used for bulk copying when
- * merging segments, if the field numbers are
- * congruent. Once this returns, the tvf & tvd streams
- * are seeked to the startDocID. */
- final void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException {
-
- if (tvx == null) {
- Arrays.fill(tvdLengths, 0);
- Arrays.fill(tvfLengths, 0);
- return;
- }
-
- // SegmentMerger calls canReadRawDocs() first and should
- // not call us if that returns false.
- if (format < FORMAT_VERSION2)
- throw new IllegalStateException("cannot read raw docs with older term vector formats");
-
- seekTvx(startDocID);
-
- long tvdPosition = tvx.readLong();
- tvd.seek(tvdPosition);
-
- long tvfPosition = tvx.readLong();
- tvf.seek(tvfPosition);
-
- long lastTvdPosition = tvdPosition;
- long lastTvfPosition = tvfPosition;
-
- int count = 0;
- while (count < numDocs) {
- final int docID = docStoreOffset + startDocID + count + 1;
- assert docID <= numTotalDocs;
- if (docID < numTotalDocs) {
- tvdPosition = tvx.readLong();
- tvfPosition = tvx.readLong();
- } else {
- tvdPosition = tvd.length();
- tvfPosition = tvf.length();
- assert count == numDocs-1;
- }
- tvdLengths[count] = (int) (tvdPosition-lastTvdPosition);
- tvfLengths[count] = (int) (tvfPosition-lastTvfPosition);
- count++;
- lastTvdPosition = tvdPosition;
- lastTvfPosition = tvfPosition;
- }
- }
-
- private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException
- {
- int format = in.readInt();
- if (format > FORMAT_CURRENT) {
- throw new CorruptIndexException("Incompatible format version: " + format + " expected "
- + FORMAT_CURRENT + " or less");
- }
- return format;
- }
-
- public void close() throws IOException {
- IOUtils.close(tvx, tvd, tvf);
- }
-
- /**
- *
- * @return The number of documents in the reader
- */
- int size() {
- return size;
- }
-
- public void get(int docNum, String field, TermVectorMapper mapper) throws IOException {
- if (tvx != null) {
- int fieldNumber = fieldInfos.fieldNumber(field);
- //We need to account for the FORMAT_SIZE at when seeking in the tvx
- //We don't need to do this in other seeks because we already have the
- // file pointer
- //that was written in another file
- seekTvx(docNum);
- //System.out.println("TVX Pointer: " + tvx.getFilePointer());
- long tvdPosition = tvx.readLong();
-
- tvd.seek(tvdPosition);
- int fieldCount = tvd.readVInt();
- //System.out.println("Num Fields: " + fieldCount);
- // There are only a few fields per document. We opt for a full scan
- // rather then requiring that they be ordered. We need to read through
- // all of the fields anyway to get to the tvf pointers.
- int number = 0;
- int found = -1;
- for (int i = 0; i < fieldCount; i++) {
- if (format >= FORMAT_VERSION)
- number = tvd.readVInt();
- else
- number += tvd.readVInt();
-
- if (number == fieldNumber)
- found = i;
- }
-
- // This field, although valid in the segment, was not found in this
- // document
- if (found != -1) {
- // Compute position in the tvf file
- long position;
- if (format >= FORMAT_VERSION2)
- position = tvx.readLong();
- else
- position = tvd.readVLong();
- for (int i = 1; i <= found; i++)
- position += tvd.readVLong();
-
- mapper.setDocumentNumber(docNum);
- readTermVector(field, position, mapper);
- } else {
- //System.out.println("Fieldable not found");
- }
- } else {
- //System.out.println("No tvx file");
- }
- }
-
-
-
- /**
- * Retrieve the term vector for the given document and field
- * @param docNum The document number to retrieve the vector for
- * @param field The field within the document to retrieve
- * @return The TermFreqVector for the document and field or null if there is no termVector for this field.
- * @throws IOException if there is an error reading the term vector files
- */
- TermFreqVector get(int docNum, String field) throws IOException {
- // Check if no term vectors are available for this segment at all
- ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
- get(docNum, field, mapper);
-
- return mapper.materializeVector();
- }
-
- // Reads the String[] fields; you have to pre-seek tvd to
- // the right point
- final private String[] readFields(int fieldCount) throws IOException {
- int number = 0;
- String[] fields = new String[fieldCount];
-
- for (int i = 0; i < fieldCount; i++) {
- if (format >= FORMAT_VERSION)
- number = tvd.readVInt();
- else
- number += tvd.readVInt();
-
- fields[i] = fieldInfos.fieldName(number);
- }
-
- return fields;
- }
-
- // Reads the long[] offsets into TVF; you have to pre-seek
- // tvx/tvd to the right point
- final private long[] readTvfPointers(int fieldCount) throws IOException {
- // Compute position in the tvf file
- long position;
- if (format >= FORMAT_VERSION2)
- position = tvx.readLong();
- else
- position = tvd.readVLong();
-
- long[] tvfPointers = new long[fieldCount];
- tvfPointers[0] = position;
-
- for (int i = 1; i < fieldCount; i++) {
- position += tvd.readVLong();
- tvfPointers[i] = position;
- }
-
- return tvfPointers;
- }
-
- /**
- * Return all term vectors stored for this document or null if the could not be read in.
- *
- * @param docNum The document number to retrieve the vector for
- * @return All term frequency vectors
- * @throws IOException if there is an error reading the term vector files
- */
- TermFreqVector[] get(int docNum) throws IOException {
- TermFreqVector[] result = null;
- if (tvx != null) {
- //We need to offset by
- seekTvx(docNum);
- long tvdPosition = tvx.readLong();
-
- tvd.seek(tvdPosition);
- int fieldCount = tvd.readVInt();
-
- // No fields are vectorized for this document
- if (fieldCount != 0) {
- final String[] fields = readFields(fieldCount);
- final long[] tvfPointers = readTvfPointers(fieldCount);
- result = readTermVectors(docNum, fields, tvfPointers);
- }
- } else {
- //System.out.println("No tvx file");
- }
- return result;
- }
-
- public void get(int docNumber, TermVectorMapper mapper) throws IOException {
- // Check if no term vectors are available for this segment at all
- if (tvx != null) {
- //We need to offset by
-
- seekTvx(docNumber);
- long tvdPosition = tvx.readLong();
-
- tvd.seek(tvdPosition);
- int fieldCount = tvd.readVInt();
-
- // No fields are vectorized for this document
- if (fieldCount != 0) {
- final String[] fields = readFields(fieldCount);
- final long[] tvfPointers = readTvfPointers(fieldCount);
- mapper.setDocumentNumber(docNumber);
- readTermVectors(fields, tvfPointers, mapper);
- }
- } else {
- //System.out.println("No tvx file");
- }
- }
-
-
- private SegmentTermVector[] readTermVectors(int docNum, String fields[], long tvfPointers[])
- throws IOException {
- SegmentTermVector res[] = new SegmentTermVector[fields.length];
- for (int i = 0; i < fields.length; i++) {
- ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
- mapper.setDocumentNumber(docNum);
- readTermVector(fields[i], tvfPointers[i], mapper);
- res[i] = (SegmentTermVector) mapper.materializeVector();
- }
- return res;
- }
-
- private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper)
- throws IOException {
- for (int i = 0; i < fields.length; i++) {
- readTermVector(fields[i], tvfPointers[i], mapper);
- }
- }
-
-
- /**
- *
- * @param field The field to read in
- * @param tvfPointer The pointer within the tvf file where we should start reading
- * @param mapper The mapper used to map the TermVector
- * @throws IOException
- */
- private void readTermVector(String field, long tvfPointer, TermVectorMapper mapper)
- throws IOException {
-
- // Now read the data from specified position
- //We don't need to offset by the FORMAT here since the pointer already includes the offset
- tvf.seek(tvfPointer);
-
- int numTerms = tvf.readVInt();
- //System.out.println("Num Terms: " + numTerms);
- // If no terms - return a constant empty termvector. However, this should never occur!
- if (numTerms == 0)
- return;
-
- boolean storePositions;
- boolean storeOffsets;
-
- if (format >= FORMAT_VERSION){
- byte bits = tvf.readByte();
- storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
- storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
- }
- else{
- tvf.readVInt();
- storePositions = false;
- storeOffsets = false;
- }
- mapper.setExpectations(field, numTerms, storeOffsets, storePositions);
- int start = 0;
- int deltaLength = 0;
- int totalLength = 0;
- byte[] byteBuffer;
- char[] charBuffer;
- final boolean preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;
-
- // init the buffers
- if (preUTF8) {
- charBuffer = new char[10];
- byteBuffer = null;
- } else {
- charBuffer = null;
- byteBuffer = new byte[20];
- }
-
- for (int i = 0; i < numTerms; i++) {
- start = tvf.readVInt();
- deltaLength = tvf.readVInt();
- totalLength = start + deltaLength;
-
- final String term;
-
- if (preUTF8) {
- // Term stored as java chars
- if (charBuffer.length < totalLength) {
- charBuffer = ArrayUtil.grow(charBuffer, totalLength);
- }
- tvf.readChars(charBuffer, start, deltaLength);
- term = new String(charBuffer, 0, totalLength);
- } else {
- // Term stored as utf8 bytes
- if (byteBuffer.length < totalLength) {
- byteBuffer = ArrayUtil.grow(byteBuffer, totalLength);
- }
- tvf.readBytes(byteBuffer, start, deltaLength);
- term = new String(byteBuffer, 0, totalLength, "UTF-8");
- }
- int freq = tvf.readVInt();
- int [] positions = null;
- if (storePositions) { //read in the positions
- //does the mapper even care about positions?
- if (mapper.isIgnoringPositions() == false) {
- positions = new int[freq];
- int prevPosition = 0;
- for (int j = 0; j < freq; j++)
- {
- positions[j] = prevPosition + tvf.readVInt();
- prevPosition = positions[j];
- }
- } else {
- //we need to skip over the positions. Since these are VInts, I don't believe there is anyway to know for sure how far to skip
- //
- for (int j = 0; j < freq; j++)
- {
- tvf.readVInt();
- }
- }
- }
- TermVectorOffsetInfo[] offsets = null;
- if (storeOffsets) {
- //does the mapper even care about offsets?
- if (mapper.isIgnoringOffsets() == false) {
- offsets = new TermVectorOffsetInfo[freq];
- int prevOffset = 0;
- for (int j = 0; j < freq; j++) {
- int startOffset = prevOffset + tvf.readVInt();
- int endOffset = startOffset + tvf.readVInt();
- offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
- prevOffset = endOffset;
- }
- } else {
- for (int j = 0; j < freq; j++){
- tvf.readVInt();
- tvf.readVInt();
- }
- }
- }
- mapper.map(term, freq, offsets, positions);
- }
- }
-
- @Override
- protected Object clone() throws CloneNotSupportedException {
-
- final TermVectorsReader clone = (TermVectorsReader) super.clone();
-
- // These are null when a TermVectorsReader was created
- // on a segment that did not have term vectors saved
- if (tvx != null && tvd != null && tvf != null) {
- clone.tvx = (IndexInput) tvx.clone();
- clone.tvd = (IndexInput) tvd.clone();
- clone.tvf = (IndexInput) tvf.clone();
- }
-
- return clone;
- }
-}
-
-
-/**
- * Models the existing parallel array structure
- */
-class ParallelArrayTermVectorMapper extends TermVectorMapper
-{
-
- private String[] terms;
- private int[] termFreqs;
- private int positions[][];
- private TermVectorOffsetInfo offsets[][];
- private int currentPosition;
- private boolean storingOffsets;
- private boolean storingPositions;
- private String field;
-
- @Override
- public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
- this.field = field;
- terms = new String[numTerms];
- termFreqs = new int[numTerms];
- this.storingOffsets = storeOffsets;
- this.storingPositions = storePositions;
- if(storePositions)
- this.positions = new int[numTerms][];
- if(storeOffsets)
- this.offsets = new TermVectorOffsetInfo[numTerms][];
- }
-
- @Override
- public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
- terms[currentPosition] = term;
- termFreqs[currentPosition] = frequency;
- if (storingOffsets)
- {
- this.offsets[currentPosition] = offsets;
- }
- if (storingPositions)
- {
- this.positions[currentPosition] = positions;
- }
- currentPosition++;
- }
-
- /**
- * Construct the vector
- * @return The {@link TermFreqVector} based on the mappings.
- */
- public TermFreqVector materializeVector() {
- SegmentTermVector tv = null;
- if (field != null && terms != null) {
- if (storingPositions || storingOffsets) {
- tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
- } else {
- tv = new SegmentTermVector(field, terms, termFreqs);
- }
- }
- return tv;
- }
-}