pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.5.0 / lucene / src / java / org / apache / lucene / index / TermVectorsWriter.java
diff --git a/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java

new file mode 100644 (file)

index 0000000..4f41e9d
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java
@@ -0,0 +1,206 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.UnicodeUtil;
+
+import java.io.IOException;
+
+final class TermVectorsWriter {
+  
+  private IndexOutput tvx = null, tvd = null, tvf = null;
+  private FieldInfos fieldInfos;
+  final UnicodeUtil.UTF8Result[] utf8Results = new UnicodeUtil.UTF8Result[] {new UnicodeUtil.UTF8Result(),
+                                                                             new UnicodeUtil.UTF8Result()};
+
+  public TermVectorsWriter(Directory directory, String segment,
+                           FieldInfos fieldInfos) throws IOException {
+    boolean success = false;
+    try {
+      // Open files for TermVector storage
+      tvx = directory.createOutput(IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_INDEX_EXTENSION));
+      tvx.writeInt(TermVectorsReader.FORMAT_CURRENT);
+      tvd = directory.createOutput(IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_DOCUMENTS_EXTENSION));
+      tvd.writeInt(TermVectorsReader.FORMAT_CURRENT);
+      tvf = directory.createOutput(IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_FIELDS_EXTENSION));
+      tvf.writeInt(TermVectorsReader.FORMAT_CURRENT);
+      success = true;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(tvx, tvd, tvf);
+      }
+    }
+
+    this.fieldInfos = fieldInfos;
+  }
+
+  /**
+   * Add a complete document specified by all its term vectors. If document has no
+   * term vectors, add value for tvx.
+   * 
+   * @param vectors
+   * @throws IOException
+   */
+  public final void addAllDocVectors(TermFreqVector[] vectors) throws IOException {
+
+    tvx.writeLong(tvd.getFilePointer());
+    tvx.writeLong(tvf.getFilePointer());
+
+    if (vectors != null) {
+      final int numFields = vectors.length;
+      tvd.writeVInt(numFields);
+
+      long[] fieldPointers = new long[numFields];
+
+      for (int i=0; i<numFields; i++) {
+        fieldPointers[i] = tvf.getFilePointer();
+
+        final int fieldNumber = fieldInfos.fieldNumber(vectors[i].getField());
+
+        // 1st pass: write field numbers to tvd
+        tvd.writeVInt(fieldNumber);
+
+        final int numTerms = vectors[i].size();
+        tvf.writeVInt(numTerms);
+
+        final TermPositionVector tpVector;
+
+        final byte bits;
+        final boolean storePositions;
+        final boolean storeOffsets;
+
+        if (vectors[i] instanceof TermPositionVector) {
+          // May have positions & offsets
+          tpVector = (TermPositionVector) vectors[i];
+          storePositions = tpVector.size() > 0 && tpVector.getTermPositions(0) != null;
+          storeOffsets = tpVector.size() > 0 && tpVector.getOffsets(0) != null;
+          bits = (byte) ((storePositions ? TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR : 0) +
+                         (storeOffsets ? TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR : 0));
+        } else {
+          tpVector = null;
+          bits = 0;
+          storePositions = false;
+          storeOffsets = false;
+        }
+
+        tvf.writeVInt(bits);
+
+        final String[] terms = vectors[i].getTerms();
+        final int[] freqs = vectors[i].getTermFrequencies();
+
+        int utf8Upto = 0;
+        utf8Results[1].length = 0;
+
+        for (int j=0; j<numTerms; j++) {
+
+          UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].length(), utf8Results[utf8Upto]);
+          
+          int start = StringHelper.bytesDifference(utf8Results[1-utf8Upto].result,
+                                                   utf8Results[1-utf8Upto].length,
+                                                   utf8Results[utf8Upto].result,
+                                                   utf8Results[utf8Upto].length);
+          int length = utf8Results[utf8Upto].length - start;
+          tvf.writeVInt(start);       // write shared prefix length
+          tvf.writeVInt(length);        // write delta length
+          tvf.writeBytes(utf8Results[utf8Upto].result, start, length);  // write delta bytes
+          utf8Upto = 1-utf8Upto;
+
+          final int termFreq = freqs[j];
+
+          tvf.writeVInt(termFreq);
+
+          if (storePositions) {
+            final int[] positions = tpVector.getTermPositions(j);
+            if (positions == null)
+              throw new IllegalStateException("Trying to write positions that are null!");
+            assert positions.length == termFreq;
+
+            // use delta encoding for positions
+            int lastPosition = 0;
+            for(int k=0;k<positions.length;k++) {
+              final int position = positions[k];
+              tvf.writeVInt(position-lastPosition);
+              lastPosition = position;
+            }
+          }
+
+          if (storeOffsets) {
+            final TermVectorOffsetInfo[] offsets = tpVector.getOffsets(j);
+            if (offsets == null)
+              throw new IllegalStateException("Trying to write offsets that are null!");
+            assert offsets.length == termFreq;
+
+            // use delta encoding for offsets
+            int lastEndOffset = 0;
+            for(int k=0;k<offsets.length;k++) {
+              final int startOffset = offsets[k].getStartOffset();
+              final int endOffset = offsets[k].getEndOffset();
+              tvf.writeVInt(startOffset-lastEndOffset);
+              tvf.writeVInt(endOffset-startOffset);
+              lastEndOffset = endOffset;
+            }
+          }
+        }
+      }
+
+      // 2nd pass: write field pointers to tvd
+      if (numFields > 1) {
+        long lastFieldPointer = fieldPointers[0];
+        for (int i=1; i<numFields; i++) {
+          final long fieldPointer = fieldPointers[i];
+          tvd.writeVLong(fieldPointer-lastFieldPointer);
+          lastFieldPointer = fieldPointer;
+        }
+      }
+    } else
+      tvd.writeVInt(0);
+  }
+
+  /**
+   * Do a bulk copy of numDocs documents from reader to our
+   * streams.  This is used to expedite merging, if the
+   * field numbers are congruent.
+   */
+  final void addRawDocuments(TermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs) throws IOException {
+    long tvdPosition = tvd.getFilePointer();
+    long tvfPosition = tvf.getFilePointer();
+    long tvdStart = tvdPosition;
+    long tvfStart = tvfPosition;
+    for(int i=0;i<numDocs;i++) {
+      tvx.writeLong(tvdPosition);
+      tvdPosition += tvdLengths[i];
+      tvx.writeLong(tvfPosition);
+      tvfPosition += tvfLengths[i];
+    }
+    tvd.copyBytes(reader.getTvdStream(), tvdPosition-tvdStart);
+    tvf.copyBytes(reader.getTvfStream(), tvfPosition-tvfStart);
+    assert tvd.getFilePointer() == tvdPosition;
+    assert tvf.getFilePointer() == tvfPosition;
+  }
+  
+  /** Close all streams. */
+  final void close() throws IOException {
+    // make an effort to close all streams we can but remember and re-throw
+    // the first exception encountered in this process
+    IOUtils.close(tvx, tvd, tvf);
+  }
+}