lucene-java-3.4.0/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java

   1 package org.apache.lucene.index;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.Closeable;
  21 import java.io.IOException;
  22 import java.util.Arrays;
  23
  24 import org.apache.lucene.store.BufferedIndexInput;
  25 import org.apache.lucene.store.Directory;
  26 import org.apache.lucene.store.IndexInput;
  27 import org.apache.lucene.util.ArrayUtil;
  28 import org.apache.lucene.util.IOUtils;
  29
  30 class TermVectorsReader implements Cloneable, Closeable {
  31
  32   // NOTE: if you make a new format, it must be larger than
  33   // the current format
  34   static final int FORMAT_VERSION = 2;
  35
  36   // Changes to speed up bulk merging of term vectors:
  37   static final int FORMAT_VERSION2 = 3;
  38
  39   // Changed strings to UTF8 with length-in-bytes not length-in-chars
  40   static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4;
  41
  42   // NOTE: always change this if you switch to a new format!
  43   static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;
  44
  45   //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
  46   static final int FORMAT_SIZE = 4;
  47
  48   static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
  49   static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
  50
  51   private FieldInfos fieldInfos;
  52
  53   private IndexInput tvx;
  54   private IndexInput tvd;
  55   private IndexInput tvf;
  56   private int size;
  57   private int numTotalDocs;
  58
  59   // The docID offset where our docs begin in the index
  60   // file.  This will be 0 if we have our own private file.
  61   private int docStoreOffset;
  62
  63   private final int format;
  64
  /**
   * Opens the term vector files of a segment using the default read buffer
   * size, {@link BufferedIndexInput#BUFFER_SIZE}.
   *
   * @param d directory holding the segment's files
   * @param segment name of the segment
   * @param fieldInfos field metadata of the segment
   * @throws CorruptIndexException if a file reports a format newer than {@link #FORMAT_CURRENT}
   * @throws IOException if a file cannot be opened or read
   */
  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
      throws CorruptIndexException, IOException {
    this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE);
  }
  69
  /**
   * Opens the term vector files of a segment with the given read buffer size.
   * Delegates with a doc store offset of {@code -1}, which makes the full
   * constructor start at offset 0 and take the document count from the length
   * of the tvx file.
   *
   * @param d directory holding the segment's files
   * @param segment name of the segment
   * @param fieldInfos field metadata of the segment
   * @param readBufferSize buffer size passed to {@code Directory.openInput}
   * @throws CorruptIndexException if a file reports a format newer than {@link #FORMAT_CURRENT}
   * @throws IOException if a file cannot be opened or read
   */
  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)
      throws CorruptIndexException, IOException {
    this(d, segment, fieldInfos, readBufferSize, -1, 0);
  }
  74
  75   TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
  76     throws CorruptIndexException, IOException {
  77     boolean success = false;
  78
  79     try {
  80       String idxName = IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_INDEX_EXTENSION);
  81       tvx = d.openInput(idxName, readBufferSize);
  82       format = checkValidFormat(tvx);
  83       tvd = d.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_DOCUMENTS_EXTENSION), readBufferSize);
  84       final int tvdFormat = checkValidFormat(tvd);
  85       tvf = d.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_FIELDS_EXTENSION), readBufferSize);
  86       final int tvfFormat = checkValidFormat(tvf);
  87
  88       assert format == tvdFormat;
  89       assert format == tvfFormat;
  90
  91       if (format >= FORMAT_VERSION2) {
  92         numTotalDocs = (int) (tvx.length() >> 4);
  93       } else {
  94         assert (tvx.length()-FORMAT_SIZE) % 8 == 0;
  95         numTotalDocs = (int) (tvx.length() >> 3);
  96       }
  97
  98       if (-1 == docStoreOffset) {
  99         this.docStoreOffset = 0;
 100         this.size = numTotalDocs;
 101         assert size == 0 || numTotalDocs == size;
 102       } else {
 103         this.docStoreOffset = docStoreOffset;
 104         this.size = size;
 105         // Verify the file is long enough to hold all of our
 106         // docs
 107         assert numTotalDocs >= size + docStoreOffset: "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset;
 108       }
 109
 110       this.fieldInfos = fieldInfos;
 111       success = true;
 112     } finally {
 113       // With lock-less commits, it's entirely possible (and
 114       // fine) to hit a FileNotFound exception above. In
 115       // this case, we want to explicitly close any subset
 116       // of things that were opened so that we don't have to
 117       // wait for a GC to do so.
 118       if (!success) {
 119         close();
 120       }
 121     }
 122   }
 123
 124   // Used for bulk copy when merging
 125   IndexInput getTvdStream() {
 126     return tvd;
 127   }
 128
 129   // Used for bulk copy when merging
 130   IndexInput getTvfStream() {
 131     return tvf;
 132   }
 133
 134   final private void seekTvx(final int docNum) throws IOException {
 135     if (format < FORMAT_VERSION2)
 136       tvx.seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE);
 137     else
 138       tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
 139   }
 140
 141   boolean canReadRawDocs() {
 142     return format >= FORMAT_UTF8_LENGTH_IN_BYTES;
 143   }
 144
 145   /** Retrieve the length (in bytes) of the tvd and tvf
 146    *  entries for the next numDocs starting with
 147    *  startDocID.  This is used for bulk copying when
 148    *  merging segments, if the field numbers are
 149    *  congruent.  Once this returns, the tvf & tvd streams
 150    *  are seeked to the startDocID. */
 151   final void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException {
 152
 153     if (tvx == null) {
 154       Arrays.fill(tvdLengths, 0);
 155       Arrays.fill(tvfLengths, 0);
 156       return;
 157     }
 158
 159     // SegmentMerger calls canReadRawDocs() first and should
 160     // not call us if that returns false.
 161     if (format < FORMAT_VERSION2)
 162       throw new IllegalStateException("cannot read raw docs with older term vector formats");
 163
 164     seekTvx(startDocID);
 165
 166     long tvdPosition = tvx.readLong();
 167     tvd.seek(tvdPosition);
 168
 169     long tvfPosition = tvx.readLong();
 170     tvf.seek(tvfPosition);
 171
 172     long lastTvdPosition = tvdPosition;
 173     long lastTvfPosition = tvfPosition;
 174
 175     int count = 0;
 176     while (count < numDocs) {
 177       final int docID = docStoreOffset + startDocID + count + 1;
 178       assert docID <= numTotalDocs;
 179       if (docID < numTotalDocs)  {
 180         tvdPosition = tvx.readLong();
 181         tvfPosition = tvx.readLong();
 182       } else {
 183         tvdPosition = tvd.length();
 184         tvfPosition = tvf.length();
 185         assert count == numDocs-1;
 186       }
 187       tvdLengths[count] = (int) (tvdPosition-lastTvdPosition);
 188       tvfLengths[count] = (int) (tvfPosition-lastTvfPosition);
 189       count++;
 190       lastTvdPosition = tvdPosition;
 191       lastTvfPosition = tvfPosition;
 192     }
 193   }
 194
 195   private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException
 196   {
 197     int format = in.readInt();
 198     if (format > FORMAT_CURRENT) {
 199       throw new CorruptIndexException("Incompatible format version: " + format + " expected "
 200                                       + FORMAT_CURRENT + " or less");
 201     }
 202     return format;
 203   }
 204
 205   public void close() throws IOException {
 206     IOUtils.close(tvx, tvd, tvf);
 207   }
 208
 209   /**
 210    *
 211    * @return The number of documents in the reader
 212    */
 213   int size() {
 214     return size;
 215   }
 216
 217   public void get(int docNum, String field, TermVectorMapper mapper) throws IOException {
 218     if (tvx != null) {
 219       int fieldNumber = fieldInfos.fieldNumber(field);
 220       //We need to account for the FORMAT_SIZE at when seeking in the tvx
 221       //We don't need to do this in other seeks because we already have the
 222       // file pointer
 223       //that was written in another file
 224       seekTvx(docNum);
 225       //System.out.println("TVX Pointer: " + tvx.getFilePointer());
 226       long tvdPosition = tvx.readLong();
 227
 228       tvd.seek(tvdPosition);
 229       int fieldCount = tvd.readVInt();
 230       //System.out.println("Num Fields: " + fieldCount);
 231       // There are only a few fields per document. We opt for a full scan
 232       // rather then requiring that they be ordered. We need to read through
 233       // all of the fields anyway to get to the tvf pointers.
 234       int number = 0;
 235       int found = -1;
 236       for (int i = 0; i < fieldCount; i++) {
 237         if (format >= FORMAT_VERSION)
 238           number = tvd.readVInt();
 239         else
 240           number += tvd.readVInt();
 241
 242         if (number == fieldNumber)
 243           found = i;
 244       }
 245
 246       // This field, although valid in the segment, was not found in this
 247       // document
 248       if (found != -1) {
 249         // Compute position in the tvf file
 250         long position;
 251         if (format >= FORMAT_VERSION2)
 252           position = tvx.readLong();
 253         else
 254           position = tvd.readVLong();
 255         for (int i = 1; i <= found; i++)
 256           position += tvd.readVLong();
 257
 258         mapper.setDocumentNumber(docNum);
 259         readTermVector(field, position, mapper);
 260       } else {
 261         //System.out.println("Fieldable not found");
 262       }
 263     } else {
 264       //System.out.println("No tvx file");
 265     }
 266   }
 267
 268
 269
 270   /**
 271    * Retrieve the term vector for the given document and field
 272    * @param docNum The document number to retrieve the vector for
 273    * @param field The field within the document to retrieve
 274    * @return The TermFreqVector for the document and field or null if there is no termVector for this field.
 275    * @throws IOException if there is an error reading the term vector files
 276    */
 277   TermFreqVector get(int docNum, String field) throws IOException {
 278     // Check if no term vectors are available for this segment at all
 279     ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
 280     get(docNum, field, mapper);
 281
 282     return mapper.materializeVector();
 283   }
 284
 285   // Reads the String[] fields; you have to pre-seek tvd to
 286   // the right point
 287   final private String[] readFields(int fieldCount) throws IOException {
 288     int number = 0;
 289     String[] fields = new String[fieldCount];
 290
 291     for (int i = 0; i < fieldCount; i++) {
 292       if (format >= FORMAT_VERSION)
 293         number = tvd.readVInt();
 294       else
 295         number += tvd.readVInt();
 296
 297       fields[i] = fieldInfos.fieldName(number);
 298     }
 299
 300     return fields;
 301   }
 302
 303   // Reads the long[] offsets into TVF; you have to pre-seek
 304   // tvx/tvd to the right point
 305   final private long[] readTvfPointers(int fieldCount) throws IOException {
 306     // Compute position in the tvf file
 307     long position;
 308     if (format >= FORMAT_VERSION2)
 309       position = tvx.readLong();
 310     else
 311       position = tvd.readVLong();
 312
 313     long[] tvfPointers = new long[fieldCount];
 314     tvfPointers[0] = position;
 315
 316     for (int i = 1; i < fieldCount; i++) {
 317       position += tvd.readVLong();
 318       tvfPointers[i] = position;
 319     }
 320
 321     return tvfPointers;
 322   }
 323
 324   /**
 325    * Return all term vectors stored for this document or null if the could not be read in.
 326    *
 327    * @param docNum The document number to retrieve the vector for
 328    * @return All term frequency vectors
 329    * @throws IOException if there is an error reading the term vector files
 330    */
 331   TermFreqVector[] get(int docNum) throws IOException {
 332     TermFreqVector[] result = null;
 333     if (tvx != null) {
 334       //We need to offset by
 335       seekTvx(docNum);
 336       long tvdPosition = tvx.readLong();
 337
 338       tvd.seek(tvdPosition);
 339       int fieldCount = tvd.readVInt();
 340
 341       // No fields are vectorized for this document
 342       if (fieldCount != 0) {
 343         final String[] fields = readFields(fieldCount);
 344         final long[] tvfPointers = readTvfPointers(fieldCount);
 345         result = readTermVectors(docNum, fields, tvfPointers);
 346       }
 347     } else {
 348       //System.out.println("No tvx file");
 349     }
 350     return result;
 351   }
 352
 353   public void get(int docNumber, TermVectorMapper mapper) throws IOException {
 354     // Check if no term vectors are available for this segment at all
 355     if (tvx != null) {
 356       //We need to offset by
 357
 358       seekTvx(docNumber);
 359       long tvdPosition = tvx.readLong();
 360
 361       tvd.seek(tvdPosition);
 362       int fieldCount = tvd.readVInt();
 363
 364       // No fields are vectorized for this document
 365       if (fieldCount != 0) {
 366         final String[] fields = readFields(fieldCount);
 367         final long[] tvfPointers = readTvfPointers(fieldCount);
 368         mapper.setDocumentNumber(docNumber);
 369         readTermVectors(fields, tvfPointers, mapper);
 370       }
 371     } else {
 372       //System.out.println("No tvx file");
 373     }
 374   }
 375
 376
 377   private SegmentTermVector[] readTermVectors(int docNum, String fields[], long tvfPointers[])
 378           throws IOException {
 379     SegmentTermVector res[] = new SegmentTermVector[fields.length];
 380     for (int i = 0; i < fields.length; i++) {
 381       ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
 382       mapper.setDocumentNumber(docNum);
 383       readTermVector(fields[i], tvfPointers[i], mapper);
 384       res[i] = (SegmentTermVector) mapper.materializeVector();
 385     }
 386     return res;
 387   }
 388
 389   private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper)
 390           throws IOException {
 391     for (int i = 0; i < fields.length; i++) {
 392       readTermVector(fields[i], tvfPointers[i], mapper);
 393     }
 394   }
 395
 396
 397   /**
 398    *
 399    * @param field The field to read in
 400    * @param tvfPointer The pointer within the tvf file where we should start reading
 401    * @param mapper The mapper used to map the TermVector
 402    * @throws IOException
 403    */
 404   private void readTermVector(String field, long tvfPointer, TermVectorMapper mapper)
 405           throws IOException {
 406
 407     // Now read the data from specified position
 408     //We don't need to offset by the FORMAT here since the pointer already includes the offset
 409     tvf.seek(tvfPointer);
 410
 411     int numTerms = tvf.readVInt();
 412     //System.out.println("Num Terms: " + numTerms);
 413     // If no terms - return a constant empty termvector. However, this should never occur!
 414     if (numTerms == 0)
 415       return;
 416
 417     boolean storePositions;
 418     boolean storeOffsets;
 419
 420     if (format >= FORMAT_VERSION){
 421       byte bits = tvf.readByte();
 422       storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
 423       storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
 424     }
 425     else{
 426       tvf.readVInt();
 427       storePositions = false;
 428       storeOffsets = false;
 429     }
 430     mapper.setExpectations(field, numTerms, storeOffsets, storePositions);
 431     int start = 0;
 432     int deltaLength = 0;
 433     int totalLength = 0;
 434     byte[] byteBuffer;
 435     char[] charBuffer;
 436     final boolean preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;
 437
 438     // init the buffers
 439     if (preUTF8) {
 440       charBuffer = new char[10];
 441       byteBuffer = null;
 442     } else {
 443       charBuffer = null;
 444       byteBuffer = new byte[20];
 445     }
 446
 447     for (int i = 0; i < numTerms; i++) {
 448       start = tvf.readVInt();
 449       deltaLength = tvf.readVInt();
 450       totalLength = start + deltaLength;
 451
 452       final String term;
 453
 454       if (preUTF8) {
 455         // Term stored as java chars
 456         if (charBuffer.length < totalLength) {
 457           charBuffer = ArrayUtil.grow(charBuffer, totalLength);
 458         }
 459         tvf.readChars(charBuffer, start, deltaLength);
 460         term = new String(charBuffer, 0, totalLength);
 461       } else {
 462         // Term stored as utf8 bytes
 463         if (byteBuffer.length < totalLength) {
 464           byteBuffer = ArrayUtil.grow(byteBuffer, totalLength);
 465         }
 466         tvf.readBytes(byteBuffer, start, deltaLength);
 467         term = new String(byteBuffer, 0, totalLength, "UTF-8");
 468       }
 469       int freq = tvf.readVInt();
 470       int [] positions = null;
 471       if (storePositions) { //read in the positions
 472         //does the mapper even care about positions?
 473         if (mapper.isIgnoringPositions() == false) {
 474           positions = new int[freq];
 475           int prevPosition = 0;
 476           for (int j = 0; j < freq; j++)
 477           {
 478             positions[j] = prevPosition + tvf.readVInt();
 479             prevPosition = positions[j];
 480           }
 481         } else {
 482           //we need to skip over the positions.  Since these are VInts, I don't believe there is anyway to know for sure how far to skip
 483           //
 484           for (int j = 0; j < freq; j++)
 485           {
 486             tvf.readVInt();
 487           }
 488         }
 489       }
 490       TermVectorOffsetInfo[] offsets = null;
 491       if (storeOffsets) {
 492         //does the mapper even care about offsets?
 493         if (mapper.isIgnoringOffsets() == false) {
 494           offsets = new TermVectorOffsetInfo[freq];
 495           int prevOffset = 0;
 496           for (int j = 0; j < freq; j++) {
 497             int startOffset = prevOffset + tvf.readVInt();
 498             int endOffset = startOffset + tvf.readVInt();
 499             offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
 500             prevOffset = endOffset;
 501           }
 502         } else {
 503           for (int j = 0; j < freq; j++){
 504             tvf.readVInt();
 505             tvf.readVInt();
 506           }
 507         }
 508       }
 509       mapper.map(term, freq, offsets, positions);
 510     }
 511   }
 512
 513   @Override
 514   protected Object clone() throws CloneNotSupportedException {
 515
 516     final TermVectorsReader clone = (TermVectorsReader) super.clone();
 517
 518     // These are null when a TermVectorsReader was created
 519     // on a segment that did not have term vectors saved
 520     if (tvx != null && tvd != null && tvf != null) {
 521       clone.tvx = (IndexInput) tvx.clone();
 522       clone.tvd = (IndexInput) tvd.clone();
 523       clone.tvf = (IndexInput) tvf.clone();
 524     }
 525
 526     return clone;
 527   }
 528 }
 529
 530
 531 /**
 532  * Models the existing parallel array structure
 533  */
 534 class ParallelArrayTermVectorMapper extends TermVectorMapper
 535 {
 536
 537   private String[] terms;
 538   private int[] termFreqs;
 539   private int positions[][];
 540   private TermVectorOffsetInfo offsets[][];
 541   private int currentPosition;
 542   private boolean storingOffsets;
 543   private boolean storingPositions;
 544   private String field;
 545
 546   @Override
 547   public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
 548     this.field = field;
 549     terms = new String[numTerms];
 550     termFreqs = new int[numTerms];
 551     this.storingOffsets = storeOffsets;
 552     this.storingPositions = storePositions;
 553     if(storePositions)
 554       this.positions = new int[numTerms][];
 555     if(storeOffsets)
 556       this.offsets = new TermVectorOffsetInfo[numTerms][];
 557   }
 558
 559   @Override
 560   public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
 561     terms[currentPosition] = term;
 562     termFreqs[currentPosition] = frequency;
 563     if (storingOffsets)
 564     {
 565       this.offsets[currentPosition] = offsets;
 566     }
 567     if (storingPositions)
 568     {
 569       this.positions[currentPosition] = positions;
 570     }
 571     currentPosition++;
 572   }
 573
 574   /**
 575    * Construct the vector
 576    * @return The {@link TermFreqVector} based on the mappings.
 577    */
 578   public TermFreqVector materializeVector() {
 579     SegmentTermVector tv = null;
 580     if (field != null && terms != null) {
 581       if (storingPositions || storingOffsets) {
 582         tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
 583       } else {
 584         tv = new SegmentTermVector(field, terms, termFreqs);
 585       }
 586     }
 587     return tv;
 588   }
 589 }