X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java?ds=sidebyside

diff --git a/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java
new file mode 100644
index 0000000..8a5f876
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java
@@ -0,0 +1,590 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.store.BufferedIndexInput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.IOUtils;
+
+class TermVectorsReader implements Cloneable, Closeable {
+
+  // NOTE: if you make a new format, it must be larger than
+  // the current format
+  static final int FORMAT_VERSION = 2;
+
+  // Changes to speed up bulk merging of term vectors:
+  static final int FORMAT_VERSION2 = 3;
+
+  // Changed strings to UTF8 with length-in-bytes not length-in-chars
+  static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4;
+
+  // NOTE: always change this if you switch to a new format!
+  static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;
+
+  // The size in bytes that the format version header takes up at the beginning of each file
+  static final int FORMAT_SIZE = 4;
+
+  static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
+  static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
+
+  private FieldInfos fieldInfos;
+
+  private IndexInput tvx;
+  private IndexInput tvd;
+  private IndexInput tvf;
+  private int size;
+  private int numTotalDocs;
+
+  // The docID offset where our docs begin in the index
+  // file.  This will be 0 if we have our own private file.
+  private int docStoreOffset;
+
+  private final int format;
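+
+  // A rough sketch of the on-disk layout this reader decodes, inferred from
+  // the seek/read logic below (FORMAT_VERSION2 and later):
+  //   .tvx: FORMAT_SIZE header, then one 16-byte entry per document --
+  //         a long pointer into .tvd followed by a long pointer into .tvf
+  //   .tvd: per-document field count, field numbers and tvf pointer deltas
+  //   .tvf: per-field term data (terms, freqs, optional positions/offsets)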
+
+  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
+    throws CorruptIndexException, IOException {
+    this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE);
+  }
+
+  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)
+    throws CorruptIndexException, IOException {
+    this(d, segment, fieldInfos, readBufferSize, -1, 0);
+  }
+
+  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
+    throws CorruptIndexException, IOException {
+    boolean success = false;
+
+    try {
+      String idxName = IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_INDEX_EXTENSION);
+      tvx = d.openInput(idxName, readBufferSize);
+      format = checkValidFormat(idxName, tvx);
+      String fn = IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
+      tvd = d.openInput(fn, readBufferSize);
+      final int tvdFormat = checkValidFormat(fn, tvd);
+      fn = IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_FIELDS_EXTENSION);
+      tvf = d.openInput(fn, readBufferSize);
+      final int tvfFormat = checkValidFormat(fn, tvf);
+
+      assert format == tvdFormat;
+      assert format == tvfFormat;
+
+      if (format >= FORMAT_VERSION2) {
+        numTotalDocs = (int) (tvx.length() >> 4);
+      } else {
+        assert (tvx.length()-FORMAT_SIZE) % 8 == 0;
+        numTotalDocs = (int) (tvx.length() >> 3);
+      }
+
+      if (-1 == docStoreOffset) {
+        this.docStoreOffset = 0;
+        this.size = numTotalDocs;
+        assert size == 0 || numTotalDocs == size;
+      } else {
+        this.docStoreOffset = docStoreOffset;
+        this.size = size;
+        // Verify the file is long enough to hold all of our
+        // docs
+        assert numTotalDocs >= size + docStoreOffset: "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset;
+      }
+
+      this.fieldInfos = fieldInfos;
+      success = true;
+    } finally {
+      // With lock-less commits, it's entirely possible (and
+      // fine) to hit a FileNotFound exception above. In
+      // this case, we want to explicitly close any subset
+      // of things that were opened so that we don't have to
+      // wait for a GC to do so.
+      if (!success) {
+        close();
+      }
+    }
+  }
+
+  // Used for bulk copy when merging
+  IndexInput getTvdStream() {
+    return tvd;
+  }
+
+  // Used for bulk copy when merging
+  IndexInput getTvfStream() {
+    return tvf;
+  }
+
+  final private void seekTvx(final int docNum) throws IOException {
+    if (format < FORMAT_VERSION2)
+      tvx.seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE);
+    else
+      tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
+  }
+
+  boolean canReadRawDocs() {
+    return format >= FORMAT_UTF8_LENGTH_IN_BYTES;
+  }
+
+  /** Retrieve the length (in bytes) of the tvd and tvf
+   *  entries for the next numDocs starting with
+   *  startDocID.  This is used for bulk copying when
+   *  merging segments, if the field numbers are
+   *  congruent.  Once this returns, the tvf and tvd streams
+   *  are positioned at the startDocID. */
+  final void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException {
+
+    if (tvx == null) {
+      Arrays.fill(tvdLengths, 0);
+      Arrays.fill(tvfLengths, 0);
+      return;
+    }
+
+    // SegmentMerger calls canReadRawDocs() first and should
+    // not call us if that returns false.
+    if (format < FORMAT_VERSION2)
+      throw new IllegalStateException("cannot read raw docs with older term vector formats");
+
+    seekTvx(startDocID);
+
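+    // Each tvx entry holds absolute tvd/tvf pointers, so the per-doc lengths
+    // are just deltas between consecutive entries. For example, if doc d's
+    // tvd pointer is 1000 and doc d+1's is 1250, the tvd length for d is 250;
+    // the last doc is bounded by the file length instead of a next entry.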
+    long tvdPosition = tvx.readLong();
+    tvd.seek(tvdPosition);
+
+    long tvfPosition = tvx.readLong();
+    tvf.seek(tvfPosition);
+
+    long lastTvdPosition = tvdPosition;
+    long lastTvfPosition = tvfPosition;
+
+    int count = 0;
+    while (count < numDocs) {
+      final int docID = docStoreOffset + startDocID + count + 1;
+      assert docID <= numTotalDocs;
+      if (docID < numTotalDocs) {
+        tvdPosition = tvx.readLong();
+        tvfPosition = tvx.readLong();
+      } else {
+        tvdPosition = tvd.length();
+        tvfPosition = tvf.length();
+        assert count == numDocs-1;
+      }
+      tvdLengths[count] = (int) (tvdPosition-lastTvdPosition);
+      tvfLengths[count] = (int) (tvfPosition-lastTvfPosition);
+      count++;
+      lastTvdPosition = tvdPosition;
+      lastTvfPosition = tvfPosition;
+    }
+  }
+
+  private int checkValidFormat(String fn, IndexInput in) throws CorruptIndexException, IOException {
+    int format = in.readInt();
+    if (format > FORMAT_CURRENT) {
+      throw new IndexFormatTooNewException(in, format, 1, FORMAT_CURRENT);
+    }
+    return format;
+  }
+
+  public void close() throws IOException {
+    IOUtils.close(tvx, tvd, tvf);
+  }
+
+  /**
+   * @return The number of documents in the reader
+   */
+  int size() {
+    return size;
+  }
+
+  public void get(int docNum, String field, TermVectorMapper mapper) throws IOException {
+    if (tvx != null) {
+      int fieldNumber = fieldInfos.fieldNumber(field);
+      // We need to account for the FORMAT_SIZE when seeking in the tvx.
+      // We don't need to do this for the other seeks because those use a
+      // file pointer that was written to another file and already includes
+      // the offset.
+      seekTvx(docNum);
+      //System.out.println("TVX Pointer: " + tvx.getFilePointer());
+      long tvdPosition = tvx.readLong();
+
+      tvd.seek(tvdPosition);
+      int fieldCount = tvd.readVInt();
+      //System.out.println("Num Fields: " + fieldCount);
+      // There are only a few fields per document. We opt for a full scan
+      // rather than requiring that they be ordered. We need to read through
+      // all of the fields anyway to get to the tvf pointers.
+      int number = 0;
+      int found = -1;
+      for (int i = 0; i < fieldCount; i++) {
+        if (format >= FORMAT_VERSION)
+          number = tvd.readVInt();
+        else
+          number += tvd.readVInt();
+
+        if (number == fieldNumber)
+          found = i;
+      }
+
+      if (found != -1) {
+        // Compute position in the tvf file
+        long position;
+        if (format >= FORMAT_VERSION2)
+          position = tvx.readLong();
+        else
+          position = tvd.readVLong();
+        for (int i = 1; i <= found; i++)
+          position += tvd.readVLong();
+
+        mapper.setDocumentNumber(docNum);
+        readTermVector(field, position, mapper);
+      } else {
+        // The field, although valid in the segment, was not found in this
+        // document
+        //System.out.println("Fieldable not found");
+      }
+    } else {
+      //System.out.println("No tvx file");
+    }
+  }
+
+  /**
+   * Retrieve the term vector for the given document and field.
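+   * <p>A hypothetical usage sketch (how a reader instance is obtained is an
+   * assumption here; in practice this package-private class is driven by
+   * SegmentReader):
+   * <pre>
+   *   TermVectorsReader tvr = ...;
+   *   TermFreqVector vec = tvr.get(docNum, "body");
+   *   if (vec != null) {
+   *     String[] terms = vec.getTerms();
+   *     int[] freqs = vec.getTermFrequencies();
+   *   }
+   * </pre>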
+   *
+   * @param docNum The document number to retrieve the vector for
+   * @param field The field within the document to retrieve
+   * @return The TermFreqVector for the document and field, or null if there is no term vector for this field
+   * @throws IOException if there is an error reading the term vector files
+   */
+  TermFreqVector get(int docNum, String field) throws IOException {
+    // The mapper-based get checks whether term vectors are available at all;
+    // materializeVector() returns null if nothing was mapped.
+    ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
+    get(docNum, field, mapper);
+
+    return mapper.materializeVector();
+  }
+
+  // Reads the String[] fields; you have to pre-seek tvd to
+  // the right point
+  final private String[] readFields(int fieldCount) throws IOException {
+    int number = 0;
+    String[] fields = new String[fieldCount];
+
+    for (int i = 0; i < fieldCount; i++) {
+      if (format >= FORMAT_VERSION)
+        number = tvd.readVInt();
+      else
+        number += tvd.readVInt();
+
+      fields[i] = fieldInfos.fieldName(number);
+    }
+
+    return fields;
+  }
+
+  // Reads the long[] offsets into TVF; you have to pre-seek
+  // tvx/tvd to the right point
+  final private long[] readTvfPointers(int fieldCount) throws IOException {
+    // Compute position in the tvf file
+    long position;
+    if (format >= FORMAT_VERSION2)
+      position = tvx.readLong();
+    else
+      position = tvd.readVLong();
+
+    long[] tvfPointers = new long[fieldCount];
+    tvfPointers[0] = position;
+
+    for (int i = 1; i < fieldCount; i++) {
+      position += tvd.readVLong();
+      tvfPointers[i] = position;
+    }
+
+    return tvfPointers;
+  }
+
+  /**
+   * Return all term vectors stored for this document, or null if they could not be read in.
+   *
+   * @param docNum The document number to retrieve the vector for
+   * @return All term frequency vectors
+   * @throws IOException if there is an error reading the term vector files
+   */
+  TermFreqVector[] get(int docNum) throws IOException {
+    TermFreqVector[] result = null;
+    if (tvx != null) {
+      // We need to offset by the format header and docStoreOffset; seekTvx
+      // handles both.
+      seekTvx(docNum);
+      long tvdPosition = tvx.readLong();
+
+      tvd.seek(tvdPosition);
+      int fieldCount = tvd.readVInt();
+
+      // fieldCount == 0 means no fields were vectorized for this document
+      if (fieldCount != 0) {
+        final String[] fields = readFields(fieldCount);
+        final long[] tvfPointers = readTvfPointers(fieldCount);
+        result = readTermVectors(docNum, fields, tvfPointers);
+      }
+    } else {
+      //System.out.println("No tvx file");
+    }
+    return result;
+  }
+
+  public void get(int docNumber, TermVectorMapper mapper) throws IOException {
+    // Check if no term vectors are available for this segment at all
+    if (tvx != null) {
+      // We need to offset by the format header and docStoreOffset; seekTvx
+      // handles both.
+      seekTvx(docNumber);
+      long tvdPosition = tvx.readLong();
+
+      tvd.seek(tvdPosition);
+      int fieldCount = tvd.readVInt();
+
+      // fieldCount == 0 means no fields were vectorized for this document
+      if (fieldCount != 0) {
+        final String[] fields = readFields(fieldCount);
+        final long[] tvfPointers = readTvfPointers(fieldCount);
+        mapper.setDocumentNumber(docNumber);
+        readTermVectors(fields, tvfPointers, mapper);
+      }
+    } else {
+      //System.out.println("No tvx file");
+    }
+  }
+
+  private SegmentTermVector[] readTermVectors(int docNum, String fields[], long tvfPointers[])
+    throws IOException {
+    SegmentTermVector res[] = new SegmentTermVector[fields.length];
+    for (int i = 0; i < fields.length; i++) {
+      ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
+      mapper.setDocumentNumber(docNum);
+      readTermVector(fields[i], tvfPointers[i], mapper);
+      res[i] = (SegmentTermVector) mapper.materializeVector();
+    }
+    return res;
+  }
+
+  private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper)
+    throws IOException {
+    for (int i = 0; i < fields.length; i++) {
+      readTermVector(fields[i], tvfPointers[i], mapper);
+    }
+  }
+
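+  // A sketch of the per-field record in the .tvf file, as decoded by
+  // readTermVector below (FORMAT_VERSION and later; inferred from the read
+  // calls, not an authoritative file-format spec):
+  //   VInt numTerms, byte flags (positions/offsets bits), then per term:
+  //     VInt prefixLength, VInt suffixLength, suffix chars/UTF-8 bytes,
+  //     VInt freq, then optionally freq position deltas and/or
+  //     freq (startDelta, length) offset pairs.
+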
+  /**
+   * Reads the term vector for the given field from the tvf file and feeds
+   * it to the mapper.
+   *
+   * @param field The field to read in
+   * @param tvfPointer The pointer within the tvf file where we should start reading
+   * @param mapper The mapper used to map the TermVector
+   * @throws IOException if there is an error reading the term vector files
+   */
+  private void readTermVector(String field, long tvfPointer, TermVectorMapper mapper)
+    throws IOException {
+
+    // Now read the data from the specified position.
+    // We don't need to offset by the FORMAT here since the pointer already includes the offset
+    tvf.seek(tvfPointer);
+
+    int numTerms = tvf.readVInt();
+    //System.out.println("Num Terms: " + numTerms);
+    // If there are no terms, there is nothing to map. However, this should never occur!
+    if (numTerms == 0)
+      return;
+
+    boolean storePositions;
+    boolean storeOffsets;
+
+    if (format >= FORMAT_VERSION) {
+      byte bits = tvf.readByte();
+      storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
+      storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
+    } else {
+      tvf.readVInt();
+      storePositions = false;
+      storeOffsets = false;
+    }
+    mapper.setExpectations(field, numTerms, storeOffsets, storePositions);
+    int start = 0;
+    int deltaLength = 0;
+    int totalLength = 0;
+    byte[] byteBuffer;
+    char[] charBuffer;
+    final boolean preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;
+
+    // init the buffers
+    if (preUTF8) {
+      charBuffer = new char[10];
+      byteBuffer = null;
+    } else {
+      charBuffer = null;
+      byteBuffer = new byte[20];
+    }
+
+    for (int i = 0; i < numTerms; i++) {
+      start = tvf.readVInt();
+      deltaLength = tvf.readVInt();
+      totalLength = start + deltaLength;
+
+      final String term;
+
+      if (preUTF8) {
+        // Term stored as java chars
+        if (charBuffer.length < totalLength) {
+          charBuffer = ArrayUtil.grow(charBuffer, totalLength);
+        }
+        tvf.readChars(charBuffer, start, deltaLength);
+        term = new String(charBuffer, 0, totalLength);
+      } else {
+        // Term stored as utf8 bytes
+        if (byteBuffer.length < totalLength) {
+          byteBuffer = ArrayUtil.grow(byteBuffer, totalLength);
+        }
+        tvf.readBytes(byteBuffer, start, deltaLength);
+        term = new String(byteBuffer, 0, totalLength, "UTF-8");
+      }
+      int freq = tvf.readVInt();
+      int[] positions = null;
+      if (storePositions) { //read in the positions
+        //does the mapper even care about positions?
+        if (mapper.isIgnoringPositions() == false) {
+          positions = new int[freq];
+          int prevPosition = 0;
+          for (int j = 0; j < freq; j++) {
+            positions[j] = prevPosition + tvf.readVInt();
+            prevPosition = positions[j];
+          }
+        } else {
+          // We need to skip over the positions. Since these are VInts, there
+          // is no way to know how far to skip without reading them one by one.
+          for (int j = 0; j < freq; j++) {
+            tvf.readVInt();
+          }
+        }
+      }
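+      // Offsets, like positions, are delta-encoded VInts: each term stores
+      // (start - previousEnd, end - start) pairs. For example, stored pairs
+      // (0,4) and (1,6) decode to offsets [0,4] and [5,11].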
+      TermVectorOffsetInfo[] offsets = null;
+      if (storeOffsets) {
+        //does the mapper even care about offsets?
+        if (mapper.isIgnoringOffsets() == false) {
+          offsets = new TermVectorOffsetInfo[freq];
+          int prevOffset = 0;
+          for (int j = 0; j < freq; j++) {
+            int startOffset = prevOffset + tvf.readVInt();
+            int endOffset = startOffset + tvf.readVInt();
+            offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
+            prevOffset = endOffset;
+          }
+        } else {
+          for (int j = 0; j < freq; j++) {
+            tvf.readVInt();
+            tvf.readVInt();
+          }
+        }
+      }
+      mapper.map(term, freq, offsets, positions);
+    }
+  }
+
+  @Override
+  protected Object clone() throws CloneNotSupportedException {
+
+    final TermVectorsReader clone = (TermVectorsReader) super.clone();
+
+    // These are null when a TermVectorsReader was created
+    // on a segment that did not have term vectors saved
+    if (tvx != null && tvd != null && tvf != null) {
+      clone.tvx = (IndexInput) tvx.clone();
+      clone.tvd = (IndexInput) tvd.clone();
+      clone.tvf = (IndexInput) tvf.clone();
+    }
+
+    return clone;
+  }
+}
+
+
+/**
+ * Models the existing parallel array structure
+ */
+class ParallelArrayTermVectorMapper extends TermVectorMapper
+{
+
+  private String[] terms;
+  private int[] termFreqs;
+  private int[][] positions;
+  private TermVectorOffsetInfo[][] offsets;
+  private int currentPosition;
+  private boolean storingOffsets;
+  private boolean storingPositions;
+  private String field;
+
+  @Override
+  public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
+    this.field = field;
+    terms = new String[numTerms];
+    termFreqs = new int[numTerms];
+    this.storingOffsets = storeOffsets;
+    this.storingPositions = storePositions;
+    if (storePositions)
+      this.positions = new int[numTerms][];
+    if (storeOffsets)
+      this.offsets = new TermVectorOffsetInfo[numTerms][];
+  }
+
+  @Override
+  public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+    terms[currentPosition] = term;
+    termFreqs[currentPosition] = frequency;
+    if (storingOffsets) {
+      this.offsets[currentPosition] = offsets;
+    }
+    if (storingPositions) {
+      this.positions[currentPosition] = positions;
+    }
+    currentPosition++;
+  }
+
+  /**
+   * Construct the vector
+   * @return The {@link TermFreqVector} based on the mappings.
+   */
+  public TermFreqVector materializeVector() {
+    SegmentTermVector tv = null;
+    if (field != null && terms != null) {
+      if (storingPositions || storingOffsets) {
+        tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
+      } else {
+        tv = new SegmentTermVector(field, terms, termFreqs);
+      }
+    }
+    return tv;
+  }
+}