lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java

   1 package org.apache.lucene.index;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.Closeable;
  21 import java.io.IOException;
  22 import java.util.Arrays;
  23
  24 import org.apache.lucene.store.BufferedIndexInput;
  25 import org.apache.lucene.store.Directory;
  26 import org.apache.lucene.store.IndexInput;
  27 import org.apache.lucene.util.ArrayUtil;
  28 import org.apache.lucene.util.IOUtils;
  29
  30 class TermVectorsReader implements Cloneable, Closeable {
  31
  // On-disk format versions, in increasing order.
  // NOTE: if you make a new format, it must be larger than
  // the current format
  static final int FORMAT_VERSION = 2;

  // Changes to speed up bulk merging of term vectors:
  static final int FORMAT_VERSION2 = 3;

  // Changed strings to UTF8 with length-in-bytes not length-in-chars
  static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4;

  // NOTE: always change this if you switch to a new format!
  static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;

  // The size in bytes that the format version header takes up at the
  // beginning of each file (it is read with a single readInt())
  static final int FORMAT_SIZE = 4;

  // Bit masks applied to the per-field flags byte read from the tvf stream
  static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
  static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;

  // Used to translate between field names and field numbers
  private FieldInfos fieldInfos;

  // The three term vector streams: index (tvx), documents (tvd) and
  // fields (tvf)
  private IndexInput tvx;
  private IndexInput tvd;
  private IndexInput tvf;
  // Number of documents this reader reports via size()
  private int size;
  // Number of documents derived from the length of the tvx file
  private int numTotalDocs;

  // The docID offset where our docs begin in the index
  // file.  This will be 0 if we have our own private file.
  private int docStoreOffset;

  // Format version read from the header of the tvx file
  private final int format;
  64
  /**
   * Opens the term vector files of a segment using the default read buffer
   * size ({@link BufferedIndexInput#BUFFER_SIZE}).
   *
   * @param d directory holding the segment files
   * @param segment name of the segment
   * @param fieldInfos field infos used to map field names and numbers
   * @throws CorruptIndexException declared for format problems
   * @throws IOException if the files cannot be opened or read
   */
  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
    throws CorruptIndexException, IOException
  {
    this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE);
  }
  69
  /**
   * Opens the term vector files of a segment that has its own private
   * files: delegates with a docStoreOffset of -1 and a size of 0.
   *
   * @param d directory holding the segment files
   * @param segment name of the segment
   * @param fieldInfos field infos used to map field names and numbers
   * @param readBufferSize buffer size passed to {@code Directory.openInput}
   * @throws CorruptIndexException declared for format problems
   * @throws IOException if the files cannot be opened or read
   */
  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)
    throws CorruptIndexException, IOException
  {
    this(d, segment, fieldInfos, readBufferSize, -1, 0);
  }
  74
  75   TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
  76     throws CorruptIndexException, IOException {
  77     boolean success = false;
  78
  79     try {
  80       String idxName = IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_INDEX_EXTENSION);
  81       tvx = d.openInput(idxName, readBufferSize);
  82       format = checkValidFormat(idxName, tvx);
  83       String fn = IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
  84       tvd = d.openInput(fn, readBufferSize);
  85       final int tvdFormat = checkValidFormat(fn, tvd);
  86       fn = IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_FIELDS_EXTENSION);
  87       tvf = d.openInput(fn, readBufferSize);
  88       final int tvfFormat = checkValidFormat(fn, tvf);
  89
  90       assert format == tvdFormat;
  91       assert format == tvfFormat;
  92
  93       if (format >= FORMAT_VERSION2) {
  94         numTotalDocs = (int) (tvx.length() >> 4);
  95       } else {
  96         assert (tvx.length()-FORMAT_SIZE) % 8 == 0;
  97         numTotalDocs = (int) (tvx.length() >> 3);
  98       }
  99
 100       if (-1 == docStoreOffset) {
 101         this.docStoreOffset = 0;
 102         this.size = numTotalDocs;
 103         assert size == 0 || numTotalDocs == size;
 104       } else {
 105         this.docStoreOffset = docStoreOffset;
 106         this.size = size;
 107         // Verify the file is long enough to hold all of our
 108         // docs
 109         assert numTotalDocs >= size + docStoreOffset: "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset;
 110       }
 111
 112       this.fieldInfos = fieldInfos;
 113       success = true;
 114     } finally {
 115       // With lock-less commits, it's entirely possible (and
 116       // fine) to hit a FileNotFound exception above. In
 117       // this case, we want to explicitly close any subset
 118       // of things that were opened so that we don't have to
 119       // wait for a GC to do so.
 120       if (!success) {
 121         close();
 122       }
 123     }
 124   }
 125
 126   // Used for bulk copy when merging
 127   IndexInput getTvdStream() {
 128     return tvd;
 129   }
 130
 131   // Used for bulk copy when merging
 132   IndexInput getTvfStream() {
 133     return tvf;
 134   }
 135
 136   final private void seekTvx(final int docNum) throws IOException {
 137     if (format < FORMAT_VERSION2)
 138       tvx.seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE);
 139     else
 140       tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
 141   }
 142
 143   boolean canReadRawDocs() {
 144     return format >= FORMAT_UTF8_LENGTH_IN_BYTES;
 145   }
 146
 147   /** Retrieve the length (in bytes) of the tvd and tvf
 148    *  entries for the next numDocs starting with
 149    *  startDocID.  This is used for bulk copying when
 150    *  merging segments, if the field numbers are
 151    *  congruent.  Once this returns, the tvf & tvd streams
 152    *  are seeked to the startDocID. */
 153   final void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException {
 154
 155     if (tvx == null) {
 156       Arrays.fill(tvdLengths, 0);
 157       Arrays.fill(tvfLengths, 0);
 158       return;
 159     }
 160
 161     // SegmentMerger calls canReadRawDocs() first and should
 162     // not call us if that returns false.
 163     if (format < FORMAT_VERSION2)
 164       throw new IllegalStateException("cannot read raw docs with older term vector formats");
 165
 166     seekTvx(startDocID);
 167
 168     long tvdPosition = tvx.readLong();
 169     tvd.seek(tvdPosition);
 170
 171     long tvfPosition = tvx.readLong();
 172     tvf.seek(tvfPosition);
 173
 174     long lastTvdPosition = tvdPosition;
 175     long lastTvfPosition = tvfPosition;
 176
 177     int count = 0;
 178     while (count < numDocs) {
 179       final int docID = docStoreOffset + startDocID + count + 1;
 180       assert docID <= numTotalDocs;
 181       if (docID < numTotalDocs)  {
 182         tvdPosition = tvx.readLong();
 183         tvfPosition = tvx.readLong();
 184       } else {
 185         tvdPosition = tvd.length();
 186         tvfPosition = tvf.length();
 187         assert count == numDocs-1;
 188       }
 189       tvdLengths[count] = (int) (tvdPosition-lastTvdPosition);
 190       tvfLengths[count] = (int) (tvfPosition-lastTvfPosition);
 191       count++;
 192       lastTvdPosition = tvdPosition;
 193       lastTvfPosition = tvfPosition;
 194     }
 195   }
 196
 197   private int checkValidFormat(String fn, IndexInput in) throws CorruptIndexException, IOException
 198   {
 199     int format = in.readInt();
 200     if (format > FORMAT_CURRENT) {
 201       throw new IndexFormatTooNewException(in, format, 1, FORMAT_CURRENT);
 202     }
 203     return format;
 204   }
 205
 206   public void close() throws IOException {
 207     IOUtils.close(tvx, tvd, tvf);
 208   }
 209
 210   /**
 211    *
 212    * @return The number of documents in the reader
 213    */
 214   int size() {
 215     return size;
 216   }
 217
 218   public void get(int docNum, String field, TermVectorMapper mapper) throws IOException {
 219     if (tvx != null) {
 220       int fieldNumber = fieldInfos.fieldNumber(field);
 221       //We need to account for the FORMAT_SIZE at when seeking in the tvx
 222       //We don't need to do this in other seeks because we already have the
 223       // file pointer
 224       //that was written in another file
 225       seekTvx(docNum);
 226       //System.out.println("TVX Pointer: " + tvx.getFilePointer());
 227       long tvdPosition = tvx.readLong();
 228
 229       tvd.seek(tvdPosition);
 230       int fieldCount = tvd.readVInt();
 231       //System.out.println("Num Fields: " + fieldCount);
 232       // There are only a few fields per document. We opt for a full scan
 233       // rather then requiring that they be ordered. We need to read through
 234       // all of the fields anyway to get to the tvf pointers.
 235       int number = 0;
 236       int found = -1;
 237       for (int i = 0; i < fieldCount; i++) {
 238         if (format >= FORMAT_VERSION)
 239           number = tvd.readVInt();
 240         else
 241           number += tvd.readVInt();
 242
 243         if (number == fieldNumber)
 244           found = i;
 245       }
 246
 247       // This field, although valid in the segment, was not found in this
 248       // document
 249       if (found != -1) {
 250         // Compute position in the tvf file
 251         long position;
 252         if (format >= FORMAT_VERSION2)
 253           position = tvx.readLong();
 254         else
 255           position = tvd.readVLong();
 256         for (int i = 1; i <= found; i++)
 257           position += tvd.readVLong();
 258
 259         mapper.setDocumentNumber(docNum);
 260         readTermVector(field, position, mapper);
 261       } else {
 262         //System.out.println("Fieldable not found");
 263       }
 264     } else {
 265       //System.out.println("No tvx file");
 266     }
 267   }
 268
 269
 270
 271   /**
 272    * Retrieve the term vector for the given document and field
 273    * @param docNum The document number to retrieve the vector for
 274    * @param field The field within the document to retrieve
 275    * @return The TermFreqVector for the document and field or null if there is no termVector for this field.
 276    * @throws IOException if there is an error reading the term vector files
 277    */
 278   TermFreqVector get(int docNum, String field) throws IOException {
 279     // Check if no term vectors are available for this segment at all
 280     ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
 281     get(docNum, field, mapper);
 282
 283     return mapper.materializeVector();
 284   }
 285
 286   // Reads the String[] fields; you have to pre-seek tvd to
 287   // the right point
 288   final private String[] readFields(int fieldCount) throws IOException {
 289     int number = 0;
 290     String[] fields = new String[fieldCount];
 291
 292     for (int i = 0; i < fieldCount; i++) {
 293       if (format >= FORMAT_VERSION)
 294         number = tvd.readVInt();
 295       else
 296         number += tvd.readVInt();
 297
 298       fields[i] = fieldInfos.fieldName(number);
 299     }
 300
 301     return fields;
 302   }
 303
 304   // Reads the long[] offsets into TVF; you have to pre-seek
 305   // tvx/tvd to the right point
 306   final private long[] readTvfPointers(int fieldCount) throws IOException {
 307     // Compute position in the tvf file
 308     long position;
 309     if (format >= FORMAT_VERSION2)
 310       position = tvx.readLong();
 311     else
 312       position = tvd.readVLong();
 313
 314     long[] tvfPointers = new long[fieldCount];
 315     tvfPointers[0] = position;
 316
 317     for (int i = 1; i < fieldCount; i++) {
 318       position += tvd.readVLong();
 319       tvfPointers[i] = position;
 320     }
 321
 322     return tvfPointers;
 323   }
 324
 325   /**
 326    * Return all term vectors stored for this document or null if the could not be read in.
 327    *
 328    * @param docNum The document number to retrieve the vector for
 329    * @return All term frequency vectors
 330    * @throws IOException if there is an error reading the term vector files
 331    */
 332   TermFreqVector[] get(int docNum) throws IOException {
 333     TermFreqVector[] result = null;
 334     if (tvx != null) {
 335       //We need to offset by
 336       seekTvx(docNum);
 337       long tvdPosition = tvx.readLong();
 338
 339       tvd.seek(tvdPosition);
 340       int fieldCount = tvd.readVInt();
 341
 342       // No fields are vectorized for this document
 343       if (fieldCount != 0) {
 344         final String[] fields = readFields(fieldCount);
 345         final long[] tvfPointers = readTvfPointers(fieldCount);
 346         result = readTermVectors(docNum, fields, tvfPointers);
 347       }
 348     } else {
 349       //System.out.println("No tvx file");
 350     }
 351     return result;
 352   }
 353
 354   public void get(int docNumber, TermVectorMapper mapper) throws IOException {
 355     // Check if no term vectors are available for this segment at all
 356     if (tvx != null) {
 357       //We need to offset by
 358
 359       seekTvx(docNumber);
 360       long tvdPosition = tvx.readLong();
 361
 362       tvd.seek(tvdPosition);
 363       int fieldCount = tvd.readVInt();
 364
 365       // No fields are vectorized for this document
 366       if (fieldCount != 0) {
 367         final String[] fields = readFields(fieldCount);
 368         final long[] tvfPointers = readTvfPointers(fieldCount);
 369         mapper.setDocumentNumber(docNumber);
 370         readTermVectors(fields, tvfPointers, mapper);
 371       }
 372     } else {
 373       //System.out.println("No tvx file");
 374     }
 375   }
 376
 377
 378   private SegmentTermVector[] readTermVectors(int docNum, String fields[], long tvfPointers[])
 379           throws IOException {
 380     SegmentTermVector res[] = new SegmentTermVector[fields.length];
 381     for (int i = 0; i < fields.length; i++) {
 382       ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
 383       mapper.setDocumentNumber(docNum);
 384       readTermVector(fields[i], tvfPointers[i], mapper);
 385       res[i] = (SegmentTermVector) mapper.materializeVector();
 386     }
 387     return res;
 388   }
 389
 390   private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper)
 391           throws IOException {
 392     for (int i = 0; i < fields.length; i++) {
 393       readTermVector(fields[i], tvfPointers[i], mapper);
 394     }
 395   }
 396
 397
 398   /**
 399    *
 400    * @param field The field to read in
 401    * @param tvfPointer The pointer within the tvf file where we should start reading
 402    * @param mapper The mapper used to map the TermVector
 403    * @throws IOException
 404    */
 405   private void readTermVector(String field, long tvfPointer, TermVectorMapper mapper)
 406           throws IOException {
 407
 408     // Now read the data from specified position
 409     //We don't need to offset by the FORMAT here since the pointer already includes the offset
 410     tvf.seek(tvfPointer);
 411
 412     int numTerms = tvf.readVInt();
 413     //System.out.println("Num Terms: " + numTerms);
 414     // If no terms - return a constant empty termvector. However, this should never occur!
 415     if (numTerms == 0)
 416       return;
 417
 418     boolean storePositions;
 419     boolean storeOffsets;
 420
 421     if (format >= FORMAT_VERSION){
 422       byte bits = tvf.readByte();
 423       storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
 424       storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
 425     }
 426     else{
 427       tvf.readVInt();
 428       storePositions = false;
 429       storeOffsets = false;
 430     }
 431     mapper.setExpectations(field, numTerms, storeOffsets, storePositions);
 432     int start = 0;
 433     int deltaLength = 0;
 434     int totalLength = 0;
 435     byte[] byteBuffer;
 436     char[] charBuffer;
 437     final boolean preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;
 438
 439     // init the buffers
 440     if (preUTF8) {
 441       charBuffer = new char[10];
 442       byteBuffer = null;
 443     } else {
 444       charBuffer = null;
 445       byteBuffer = new byte[20];
 446     }
 447
 448     for (int i = 0; i < numTerms; i++) {
 449       start = tvf.readVInt();
 450       deltaLength = tvf.readVInt();
 451       totalLength = start + deltaLength;
 452
 453       final String term;
 454
 455       if (preUTF8) {
 456         // Term stored as java chars
 457         if (charBuffer.length < totalLength) {
 458           charBuffer = ArrayUtil.grow(charBuffer, totalLength);
 459         }
 460         tvf.readChars(charBuffer, start, deltaLength);
 461         term = new String(charBuffer, 0, totalLength);
 462       } else {
 463         // Term stored as utf8 bytes
 464         if (byteBuffer.length < totalLength) {
 465           byteBuffer = ArrayUtil.grow(byteBuffer, totalLength);
 466         }
 467         tvf.readBytes(byteBuffer, start, deltaLength);
 468         term = new String(byteBuffer, 0, totalLength, "UTF-8");
 469       }
 470       int freq = tvf.readVInt();
 471       int [] positions = null;
 472       if (storePositions) { //read in the positions
 473         //does the mapper even care about positions?
 474         if (mapper.isIgnoringPositions() == false) {
 475           positions = new int[freq];
 476           int prevPosition = 0;
 477           for (int j = 0; j < freq; j++)
 478           {
 479             positions[j] = prevPosition + tvf.readVInt();
 480             prevPosition = positions[j];
 481           }
 482         } else {
 483           //we need to skip over the positions.  Since these are VInts, I don't believe there is anyway to know for sure how far to skip
 484           //
 485           for (int j = 0; j < freq; j++)
 486           {
 487             tvf.readVInt();
 488           }
 489         }
 490       }
 491       TermVectorOffsetInfo[] offsets = null;
 492       if (storeOffsets) {
 493         //does the mapper even care about offsets?
 494         if (mapper.isIgnoringOffsets() == false) {
 495           offsets = new TermVectorOffsetInfo[freq];
 496           int prevOffset = 0;
 497           for (int j = 0; j < freq; j++) {
 498             int startOffset = prevOffset + tvf.readVInt();
 499             int endOffset = startOffset + tvf.readVInt();
 500             offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
 501             prevOffset = endOffset;
 502           }
 503         } else {
 504           for (int j = 0; j < freq; j++){
 505             tvf.readVInt();
 506             tvf.readVInt();
 507           }
 508         }
 509       }
 510       mapper.map(term, freq, offsets, positions);
 511     }
 512   }
 513
 514   @Override
 515   protected Object clone() throws CloneNotSupportedException {
 516
 517     final TermVectorsReader clone = (TermVectorsReader) super.clone();
 518
 519     // These are null when a TermVectorsReader was created
 520     // on a segment that did not have term vectors saved
 521     if (tvx != null && tvd != null && tvf != null) {
 522       clone.tvx = (IndexInput) tvx.clone();
 523       clone.tvd = (IndexInput) tvd.clone();
 524       clone.tvf = (IndexInput) tvf.clone();
 525     }
 526
 527     return clone;
 528   }
 529 }
 530
 531
 532 /**
 533  * Models the existing parallel array structure
 534  */
 535 class ParallelArrayTermVectorMapper extends TermVectorMapper
 536 {
 537
 538   private String[] terms;
 539   private int[] termFreqs;
 540   private int positions[][];
 541   private TermVectorOffsetInfo offsets[][];
 542   private int currentPosition;
 543   private boolean storingOffsets;
 544   private boolean storingPositions;
 545   private String field;
 546
 547   @Override
 548   public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
 549     this.field = field;
 550     terms = new String[numTerms];
 551     termFreqs = new int[numTerms];
 552     this.storingOffsets = storeOffsets;
 553     this.storingPositions = storePositions;
 554     if(storePositions)
 555       this.positions = new int[numTerms][];
 556     if(storeOffsets)
 557       this.offsets = new TermVectorOffsetInfo[numTerms][];
 558   }
 559
 560   @Override
 561   public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
 562     terms[currentPosition] = term;
 563     termFreqs[currentPosition] = frequency;
 564     if (storingOffsets)
 565     {
 566       this.offsets[currentPosition] = offsets;
 567     }
 568     if (storingPositions)
 569     {
 570       this.positions[currentPosition] = positions;
 571     }
 572     currentPosition++;
 573   }
 574
 575   /**
 576    * Construct the vector
 577    * @return The {@link TermFreqVector} based on the mappings.
 578    */
 579   public TermFreqVector materializeVector() {
 580     SegmentTermVector tv = null;
 581     if (field != null && terms != null) {
 582       if (storingPositions || storingOffsets) {
 583         tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
 584       } else {
 585         tv = new SegmentTermVector(field, terms, termFreqs);
 586       }
 587     }
 588     return tv;
 589   }
 590 }