1 package org.apache.lucene.index;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.Closeable;
21 import java.io.IOException;
22 import java.util.Arrays;
24 import org.apache.lucene.store.BufferedIndexInput;
25 import org.apache.lucene.store.Directory;
26 import org.apache.lucene.store.IndexInput;
27 import org.apache.lucene.util.ArrayUtil;
28 import org.apache.lucene.util.IOUtils;
30 class TermVectorsReader implements Cloneable, Closeable {
32 // NOTE: if you make a new format, it must be larger than
34 static final int FORMAT_VERSION = 2;
36 // Changes to speed up bulk merging of term vectors:
37 static final int FORMAT_VERSION2 = 3;
39 // Changed strings to UTF8 with length-in-bytes not length-in-chars
40 static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4;
42 // NOTE: always change this if you switch to a new format!
43 static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;
45 //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
46 static final int FORMAT_SIZE = 4;
// Per-field flag bits read from the tvf stream (see readTermVector):
// whether positions and/or offsets were stored with the term vector.
48 static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
49 static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
// Maps field names <-> field numbers for this segment.
51 private FieldInfos fieldInfos;
// The three term-vector streams: tvx = index, tvd = documents, tvf = fields.
53 private IndexInput tvx;
54 private IndexInput tvd;
55 private IndexInput tvf;
// Total number of documents recorded in the tvx file (may exceed this
// reader's own doc count when the doc store is shared across segments).
57 private int numTotalDocs;
59 // The docID offset where our docs begin in the index
60 // file. This will be 0 if we have our own private file.
61 private int docStoreOffset;
// On-disk format version, read from the stream headers in the constructor.
63 private final int format;
// Convenience constructor: private doc store, default read-buffer size.
65 TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
66 throws CorruptIndexException, IOException {
67 this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE);
// Convenience constructor: docStoreOffset of -1 means this segment has its
// own private term-vector files (see the 6-arg constructor).
70 TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)
71 throws CorruptIndexException, IOException {
72 this(d, segment, fieldInfos, readBufferSize, -1, 0);
// Opens the three term-vector streams (tvx/tvd/tvf) for the segment and
// validates that all three carry the same format version.
// NOTE(review): this chunk is missing interior lines (e.g. the try block
// opening, else branches, closing braces and the catch/finally cleanup) —
// comments below are hedged accordingly.
75 TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
76 throws CorruptIndexException, IOException {
// success is presumably used by a (not visible here) finally block to close
// the streams if anything below throws — TODO confirm against full source.
77 boolean success = false;
80 String idxName = IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_INDEX_EXTENSION);
81 tvx = d.openInput(idxName, readBufferSize);
82 format = checkValidFormat(idxName, tvx);
83 String fn = IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
84 tvd = d.openInput(fn, readBufferSize);
85 final int tvdFormat = checkValidFormat(fn, tvd);
86 fn = IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_FIELDS_EXTENSION);
87 tvf = d.openInput(fn, readBufferSize);
88 final int tvfFormat = checkValidFormat(fn, tvf);
// All three files must have been written with the same format version.
90 assert format == tvdFormat;
91 assert format == tvfFormat;
// tvx entry size: 16 bytes/doc (two longs: tvd + tvf pointers) for
// FORMAT_VERSION2 and later; 8 bytes/doc (one long) before that.
93 if (format >= FORMAT_VERSION2) {
94 numTotalDocs = (int) (tvx.length() >> 4);
96 assert (tvx.length()-FORMAT_SIZE) % 8 == 0;
97 numTotalDocs = (int) (tvx.length() >> 3);
// docStoreOffset == -1: this segment owns its term-vector files outright,
// so our docs start at 0 and we expose every doc in the file.
100 if (-1 == docStoreOffset) {
101 this.docStoreOffset = 0;
102 this.size = numTotalDocs;
103 assert size == 0 || numTotalDocs == size;
// Otherwise: shared doc store — remember where our docs begin.
105 this.docStoreOffset = docStoreOffset;
107 // Verify the file is long enough to hold all of our
109 assert numTotalDocs >= size + docStoreOffset: "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset;
112 this.fieldInfos = fieldInfos;
115 // With lock-less commits, it's entirely possible (and
116 // fine) to hit a FileNotFound exception above. In
117 // this case, we want to explicitly close any subset
118 // of things that were opened so that we don't have to
119 // wait for a GC to do so.
126 // Used for bulk copy when merging
// Accessor for the raw tvd (documents) stream; body not visible in this view
// but presumably just returns the field — TODO confirm.
127 IndexInput getTvdStream() {
131 // Used for bulk copy when merging
// Accessor for the raw tvf (fields) stream; body not visible in this view.
132 IndexInput getTvfStream() {
// Positions tvx at the index entry for docNum (relative to our doc store
// offset): 8 bytes per doc before FORMAT_VERSION2, 16 bytes per doc after,
// always skipping the FORMAT_SIZE header.
// NOTE(review): the else line between the two seeks is missing from this view.
136 final private void seekTvx(final int docNum) throws IOException {
137 if (format < FORMAT_VERSION2)
138 tvx.seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE);
140 tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
// True if this format supports raw (byte-level) doc copying during merges;
// only the UTF-8 length-in-bytes format and newer qualify.
143 boolean canReadRawDocs() {
144 return format >= FORMAT_UTF8_LENGTH_IN_BYTES;
147 /** Retrieve the length (in bytes) of the tvd and tvf
148 * entries for the next numDocs starting with
149 * startDocID. This is used for bulk copying when
150 * merging segments, if the field numbers are
151 * congruent. Once this returns, the tvf & tvd streams
152 * are seeked to the startDocID. */
153 final void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException {
// (An early-out for a null tvx presumably precedes this fill — lines are
// missing from this view; TODO confirm.)
156 Arrays.fill(tvdLengths, 0);
157 Arrays.fill(tvfLengths, 0);
161 // SegmentMerger calls canReadRawDocs() first and should
162 // not call us if that returns false.
163 if (format < FORMAT_VERSION2)
164 throw new IllegalStateException("cannot read raw docs with older term vector formats");
// Read the starting tvd/tvf pointers for startDocID and seek both streams
// there, as promised in the javadoc above.
168 long tvdPosition = tvx.readLong();
169 tvd.seek(tvdPosition);
171 long tvfPosition = tvx.readLong();
172 tvf.seek(tvfPosition);
174 long lastTvdPosition = tvdPosition;
175 long lastTvfPosition = tvfPosition;
// Entry lengths are computed as deltas between consecutive tvx pointers.
178 while (count < numDocs) {
179 final int docID = docStoreOffset + startDocID + count + 1;
180 assert docID <= numTotalDocs;
181 if (docID < numTotalDocs) {
182 tvdPosition = tvx.readLong();
183 tvfPosition = tvx.readLong();
// Last doc in the file: its entry extends to the end of the stream.
185 tvdPosition = tvd.length();
186 tvfPosition = tvf.length();
187 assert count == numDocs-1;
189 tvdLengths[count] = (int) (tvdPosition-lastTvdPosition);
190 tvfLengths[count] = (int) (tvfPosition-lastTvfPosition);
192 lastTvdPosition = tvdPosition;
193 lastTvfPosition = tvfPosition;
// Reads and returns the 4-byte format header from the given stream,
// rejecting files written by a newer Lucene version. (The return statement
// and any too-old check are not visible in this view.)
197 private int checkValidFormat(String fn, IndexInput in) throws CorruptIndexException, IOException
199 int format = in.readInt();
200 if (format > FORMAT_CURRENT) {
201 throw new IndexFormatTooNewException(in, format, 1, FORMAT_CURRENT);
// Closes all three streams; IOUtils.close tolerates nulls and closes the
// rest even if one close throws.
206 public void close() throws IOException {
207 IOUtils.close(tvx, tvd, tvf);
212 * @return The number of documents in the reader
// Loads the term vector for one (document, field) pair into the given
// mapper. Scans the tvd field-number list for the requested field, then
// resolves its tvf pointer and delegates to readTermVector. If the field
// has no vector in this doc, the mapper is simply not invoked.
// NOTE(review): several lines (null-tvx guard, seekTvx call, else branches,
// braces) are missing from this view.
218 public void get(int docNum, String field, TermVectorMapper mapper) throws IOException {
220 int fieldNumber = fieldInfos.fieldNumber(field);
221 //We need to account for the FORMAT_SIZE at when seeking in the tvx
222 //We don't need to do this in other seeks because we already have the
224 //that was written in another file
226 //System.out.println("TVX Pointer: " + tvx.getFilePointer());
227 long tvdPosition = tvx.readLong();
229 tvd.seek(tvdPosition);
230 int fieldCount = tvd.readVInt();
231 //System.out.println("Num Fields: " + fieldCount);
232 // There are only a few fields per document. We opt for a full scan
233 // rather then requiring that they be ordered. We need to read through
234 // all of the fields anyway to get to the tvf pointers.
237 for (int i = 0; i < fieldCount; i++) {
// FORMAT_VERSION+: absolute field numbers; older formats stored deltas.
238 if (format >= FORMAT_VERSION)
239 number = tvd.readVInt();
241 number += tvd.readVInt();
243 if (number == fieldNumber)
// (presumably records the match index in 'found' and breaks/continues —
// those lines are not visible here.)
247 // This field, although valid in the segment, was not found in this
250 // Compute position in the tvf file
// FORMAT_VERSION2+: tvf pointer lives in tvx; older formats chain VLong
// deltas in tvd, so we must sum 'found'+1 deltas to reach our field.
252 if (format >= FORMAT_VERSION2)
253 position = tvx.readLong();
255 position = tvd.readVLong();
256 for (int i = 1; i <= found; i++)
257 position += tvd.readVLong();
259 mapper.setDocumentNumber(docNum);
260 readTermVector(field, position, mapper);
262 //System.out.println("Fieldable not found");
265 //System.out.println("No tvx file");
272 * Retrieve the term vector for the given document and field
273 * @param docNum The document number to retrieve the vector for
274 * @param field The field within the document to retrieve
275 * @return The TermFreqVector for the document and field or null if there is no termVector for this field.
276 * @throws IOException if there is an error reading the term vector files
278 TermFreqVector get(int docNum, String field) throws IOException {
279 // Check if no term vectors are available for this segment at all
// Delegate to the mapper-based overload, then materialize the parallel
// arrays into a TermFreqVector (null if nothing was mapped).
280 ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
281 get(docNum, field, mapper);
283 return mapper.materializeVector();
286 // Reads the String[] fields; you have to pre-seek tvd to
// Resolves each field number read from tvd to its name via fieldInfos.
// NOTE(review): the else line and the return statement are not visible here.
288 final private String[] readFields(int fieldCount) throws IOException {
290 String[] fields = new String[fieldCount];
292 for (int i = 0; i < fieldCount; i++) {
// FORMAT_VERSION+: absolute field numbers; older formats stored deltas.
293 if (format >= FORMAT_VERSION)
294 number = tvd.readVInt();
296 number += tvd.readVInt();
298 fields[i] = fieldInfos.fieldName(number);
304 // Reads the long[] offsets into TVF; you have to pre-seek
305 // tvx/tvd to the right point
// Returns one absolute tvf pointer per field. The first pointer comes from
// tvx (FORMAT_VERSION2+) or tvd (older); the rest are VLong deltas in tvd
// accumulated onto it. (Return statement not visible in this view.)
306 final private long[] readTvfPointers(int fieldCount) throws IOException {
307 // Compute position in the tvf file
309 if (format >= FORMAT_VERSION2)
310 position = tvx.readLong();
312 position = tvd.readVLong();
314 long[] tvfPointers = new long[fieldCount];
315 tvfPointers[0] = position;
317 for (int i = 1; i < fieldCount; i++) {
318 position += tvd.readVLong();
319 tvfPointers[i] = position;
326 * Return all term vectors stored for this document or null if the could not be read in.
328 * @param docNum The document number to retrieve the vector for
329 * @return All term frequency vectors
330 * @throws IOException if there is an error reading the term vector files
332 TermFreqVector[] get(int docNum) throws IOException {
333 TermFreqVector[] result = null;
335 //We need to offset by
// (A null-tvx guard and a seekTvx call presumably precede this read —
// those lines are missing from this view; TODO confirm.)
337 long tvdPosition = tvx.readLong();
339 tvd.seek(tvdPosition);
340 int fieldCount = tvd.readVInt();
342 // No fields are vectorized for this document
343 if (fieldCount != 0) {
344 final String[] fields = readFields(fieldCount);
345 final long[] tvfPointers = readTvfPointers(fieldCount);
346 result = readTermVectors(docNum, fields, tvfPointers);
349 //System.out.println("No tvx file");
// Streams every vectorized field of the document into the given mapper;
// mirrors get(int) above but avoids materializing TermFreqVector objects.
354 public void get(int docNumber, TermVectorMapper mapper) throws IOException {
355 // Check if no term vectors are available for this segment at all
357 //We need to offset by
// (Null-tvx guard and seekTvx call not visible in this view.)
360 long tvdPosition = tvx.readLong();
362 tvd.seek(tvdPosition);
363 int fieldCount = tvd.readVInt();
365 // No fields are vectorized for this document
366 if (fieldCount != 0) {
367 final String[] fields = readFields(fieldCount);
368 final long[] tvfPointers = readTvfPointers(fieldCount);
369 mapper.setDocumentNumber(docNumber);
370 readTermVectors(fields, tvfPointers, mapper);
373 //System.out.println("No tvx file");
// Materializes one SegmentTermVector per field by running each through a
// fresh ParallelArrayTermVectorMapper. (Return statement not visible here.)
378 private SegmentTermVector[] readTermVectors(int docNum, String fields[], long tvfPointers[])
380 SegmentTermVector res[] = new SegmentTermVector[fields.length];
381 for (int i = 0; i < fields.length; i++) {
382 ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
383 mapper.setDocumentNumber(docNum);
384 readTermVector(fields[i], tvfPointers[i], mapper);
385 res[i] = (SegmentTermVector) mapper.materializeVector();
// Streams each field's vector into the caller-supplied mapper; the caller
// is responsible for setDocumentNumber (see get(int, TermVectorMapper)).
390 private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper)
392 for (int i = 0; i < fields.length; i++) {
393 readTermVector(fields[i], tvfPointers[i], mapper);
// Core decoder: reads one field's term vector from tvf and feeds each term
// (with freq, and optionally offsets/positions) to the mapper.
// NOTE(review): many interior lines (braces, else branches, the skip loops'
// bodies) are missing from this view; comments are hedged accordingly.
400 * @param field The field to read in
401 * @param tvfPointer The pointer within the tvf file where we should start reading
402 * @param mapper The mapper used to map the TermVector
403 * @throws IOException
405 private void readTermVector(String field, long tvfPointer, TermVectorMapper mapper)
408 // Now read the data from specified position
409 //We don't need to offset by the FORMAT here since the pointer already includes the offset
410 tvf.seek(tvfPointer);
412 int numTerms = tvf.readVInt();
413 //System.out.println("Num Terms: " + numTerms);
414 // If no terms - return a constant empty termvector. However, this should never occur!
418 boolean storePositions;
419 boolean storeOffsets;
// FORMAT_VERSION+ stores a flags byte per field; older formats (handled in
// the missing else branch) have neither positions nor offsets.
421 if (format >= FORMAT_VERSION){
422 byte bits = tvf.readByte();
423 storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
424 storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
428 storePositions = false;
429 storeOffsets = false;
431 mapper.setExpectations(field, numTerms, storeOffsets, storePositions);
// Terms are front-coded: each entry stores the shared-prefix length
// ('start') plus the suffix ('deltaLength'), so the decode buffer retains
// the previous term's prefix between iterations.
437 final boolean preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;
441 charBuffer = new char[10];
445 byteBuffer = new byte[20];
448 for (int i = 0; i < numTerms; i++) {
449 start = tvf.readVInt();
450 deltaLength = tvf.readVInt();
451 totalLength = start + deltaLength;
456 // Term stored as java chars
457 if (charBuffer.length < totalLength) {
458 charBuffer = ArrayUtil.grow(charBuffer, totalLength);
460 tvf.readChars(charBuffer, start, deltaLength);
461 term = new String(charBuffer, 0, totalLength);
463 // Term stored as utf8 bytes
464 if (byteBuffer.length < totalLength) {
465 byteBuffer = ArrayUtil.grow(byteBuffer, totalLength);
467 tvf.readBytes(byteBuffer, start, deltaLength);
468 term = new String(byteBuffer, 0, totalLength, "UTF-8");
470 int freq = tvf.readVInt();
471 int [] positions = null;
472 if (storePositions) { //read in the positions
473 //does the mapper even care about positions?
474 if (mapper.isIgnoringPositions() == false) {
// Positions are delta-encoded VInts; accumulate to absolute values.
475 positions = new int[freq];
476 int prevPosition = 0;
477 for (int j = 0; j < freq; j++)
479 positions[j] = prevPosition + tvf.readVInt();
480 prevPosition = positions[j];
483 //we need to skip over the positions. Since these are VInts, I don't believe there is anyway to know for sure how far to skip
485 for (int j = 0; j < freq; j++)
491 TermVectorOffsetInfo[] offsets = null;
493 //does the mapper even care about offsets?
494 if (mapper.isIgnoringOffsets() == false) {
// Offsets are delta-encoded: start is a delta from the previous end,
// end is a delta from its own start.
495 offsets = new TermVectorOffsetInfo[freq];
497 for (int j = 0; j < freq; j++) {
498 int startOffset = prevOffset + tvf.readVInt();
499 int endOffset = startOffset + tvf.readVInt();
500 offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
501 prevOffset = endOffset;
// Mapper ignores offsets: still must consume the two VInts per position.
504 for (int j = 0; j < freq; j++){
510 mapper.map(term, freq, offsets, positions);
// Clones the reader with independent stream positions: IndexInput.clone
// gives each clone its own file pointer over the same underlying data,
// so clones can be used concurrently with the original for reads.
515 protected Object clone() throws CloneNotSupportedException {
517 final TermVectorsReader clone = (TermVectorsReader) super.clone();
519 // These are null when a TermVectorsReader was created
520 // on a segment that did not have term vectors saved
521 if (tvx != null && tvd != null && tvf != null) {
522 clone.tvx = (IndexInput) tvx.clone();
523 clone.tvd = (IndexInput) tvd.clone();
524 clone.tvf = (IndexInput) tvf.clone();
533 * Models the existing parallel array structure
535 class ParallelArrayTermVectorMapper extends TermVectorMapper
// Parallel arrays indexed by term ordinal; filled by map(), combined by
// materializeVector().
538 private String[] terms;
539 private int[] termFreqs;
540 private int positions[][];
541 private TermVectorOffsetInfo offsets[][];
// Next free slot in the parallel arrays.
542 private int currentPosition;
543 private boolean storingOffsets;
544 private boolean storingPositions;
545 private String field;
// Sizes the parallel arrays for the announced number of terms and records
// which optional data (offsets/positions) this field carries.
// NOTE(review): the field assignment and some braces are not visible here.
548 public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
550 terms = new String[numTerms];
551 termFreqs = new int[numTerms];
552 this.storingOffsets = storeOffsets;
553 this.storingPositions = storePositions;
555 this.positions = new int[numTerms][];
557 this.offsets = new TermVectorOffsetInfo[numTerms][];
// Records one term in the next slot; offsets/positions are stored only
// when the corresponding storing* flag is set. (The currentPosition
// increment is presumably in a missing line — TODO confirm.)
561 public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
562 terms[currentPosition] = term;
563 termFreqs[currentPosition] = frequency;
566 this.offsets[currentPosition] = offsets;
568 if (storingPositions)
570 this.positions[currentPosition] = positions;
576 * Construct the vector
577 * @return The {@link TermFreqVector} based on the mappings.
579 public TermFreqVector materializeVector() {
// Returns null when nothing was mapped (field/terms never set). A vector
// with positions or offsets gets the richer SegmentTermPositionVector.
// NOTE(review): the method's tail (return, braces) lies past the end of
// this view.
580 SegmentTermVector tv = null;
581 if (field != null && terms != null) {
582 if (storingPositions || storingOffsets) {
583 tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
585 tv = new SegmentTermVector(field, terms, termFreqs);