1 package org.apache.lucene.index;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import org.apache.lucene.store.Directory;
21 import org.apache.lucene.store.IndexOutput;
22 import org.apache.lucene.util.IOUtils;
23 import org.apache.lucene.util.StringHelper;
24 import org.apache.lucene.util.UnicodeUtil;
26 import java.io.IOException;
28 final class TermVectorsWriter {
30 private IndexOutput tvx = null, tvd = null, tvf = null;
31 private FieldInfos fieldInfos;
32 final UnicodeUtil.UTF8Result[] utf8Results = new UnicodeUtil.UTF8Result[] {new UnicodeUtil.UTF8Result(),
33 new UnicodeUtil.UTF8Result()};
35 public TermVectorsWriter(Directory directory, String segment,
36 FieldInfos fieldInfos) throws IOException {
37 boolean success = false;
39 // Open files for TermVector storage
40 tvx = directory.createOutput(IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_INDEX_EXTENSION));
41 tvx.writeInt(TermVectorsReader.FORMAT_CURRENT);
42 tvd = directory.createOutput(IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_DOCUMENTS_EXTENSION));
43 tvd.writeInt(TermVectorsReader.FORMAT_CURRENT);
44 tvf = directory.createOutput(IndexFileNames.segmentFileName(segment, IndexFileNames.VECTORS_FIELDS_EXTENSION));
45 tvf.writeInt(TermVectorsReader.FORMAT_CURRENT);
49 IOUtils.closeWhileHandlingException(tvx, tvd, tvf);
53 this.fieldInfos = fieldInfos;
57 * Add a complete document specified by all its term vectors. If document has no
58 * term vectors, add value for tvx.
63 public final void addAllDocVectors(TermFreqVector[] vectors) throws IOException {
65 tvx.writeLong(tvd.getFilePointer());
66 tvx.writeLong(tvf.getFilePointer());
68 if (vectors != null) {
69 final int numFields = vectors.length;
70 tvd.writeVInt(numFields);
72 long[] fieldPointers = new long[numFields];
74 for (int i=0; i<numFields; i++) {
75 fieldPointers[i] = tvf.getFilePointer();
77 final int fieldNumber = fieldInfos.fieldNumber(vectors[i].getField());
79 // 1st pass: write field numbers to tvd
80 tvd.writeVInt(fieldNumber);
82 final int numTerms = vectors[i].size();
83 tvf.writeVInt(numTerms);
85 final TermPositionVector tpVector;
88 final boolean storePositions;
89 final boolean storeOffsets;
91 if (vectors[i] instanceof TermPositionVector) {
92 // May have positions & offsets
93 tpVector = (TermPositionVector) vectors[i];
94 storePositions = tpVector.size() > 0 && tpVector.getTermPositions(0) != null;
95 storeOffsets = tpVector.size() > 0 && tpVector.getOffsets(0) != null;
96 bits = (byte) ((storePositions ? TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR : 0) +
97 (storeOffsets ? TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR : 0));
101 storePositions = false;
102 storeOffsets = false;
107 final String[] terms = vectors[i].getTerms();
108 final int[] freqs = vectors[i].getTermFrequencies();
111 utf8Results[1].length = 0;
113 for (int j=0; j<numTerms; j++) {
115 UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].length(), utf8Results[utf8Upto]);
117 int start = StringHelper.bytesDifference(utf8Results[1-utf8Upto].result,
118 utf8Results[1-utf8Upto].length,
119 utf8Results[utf8Upto].result,
120 utf8Results[utf8Upto].length);
121 int length = utf8Results[utf8Upto].length - start;
122 tvf.writeVInt(start); // write shared prefix length
123 tvf.writeVInt(length); // write delta length
124 tvf.writeBytes(utf8Results[utf8Upto].result, start, length); // write delta bytes
125 utf8Upto = 1-utf8Upto;
127 final int termFreq = freqs[j];
129 tvf.writeVInt(termFreq);
131 if (storePositions) {
132 final int[] positions = tpVector.getTermPositions(j);
133 if (positions == null)
134 throw new IllegalStateException("Trying to write positions that are null!");
135 assert positions.length == termFreq;
137 // use delta encoding for positions
138 int lastPosition = 0;
139 for(int k=0;k<positions.length;k++) {
140 final int position = positions[k];
141 tvf.writeVInt(position-lastPosition);
142 lastPosition = position;
147 final TermVectorOffsetInfo[] offsets = tpVector.getOffsets(j);
149 throw new IllegalStateException("Trying to write offsets that are null!");
150 assert offsets.length == termFreq;
152 // use delta encoding for offsets
153 int lastEndOffset = 0;
154 for(int k=0;k<offsets.length;k++) {
155 final int startOffset = offsets[k].getStartOffset();
156 final int endOffset = offsets[k].getEndOffset();
157 tvf.writeVInt(startOffset-lastEndOffset);
158 tvf.writeVInt(endOffset-startOffset);
159 lastEndOffset = endOffset;
165 // 2nd pass: write field pointers to tvd
167 long lastFieldPointer = fieldPointers[0];
168 for (int i=1; i<numFields; i++) {
169 final long fieldPointer = fieldPointers[i];
170 tvd.writeVLong(fieldPointer-lastFieldPointer);
171 lastFieldPointer = fieldPointer;
179 * Do a bulk copy of numDocs documents from reader to our
180 * streams. This is used to expedite merging, if the
181 * field numbers are congruent.
183 final void addRawDocuments(TermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs) throws IOException {
184 long tvdPosition = tvd.getFilePointer();
185 long tvfPosition = tvf.getFilePointer();
186 long tvdStart = tvdPosition;
187 long tvfStart = tvfPosition;
188 for(int i=0;i<numDocs;i++) {
189 tvx.writeLong(tvdPosition);
190 tvdPosition += tvdLengths[i];
191 tvx.writeLong(tvfPosition);
192 tvfPosition += tvfLengths[i];
194 tvd.copyBytes(reader.getTvdStream(), tvdPosition-tvdStart);
195 tvf.copyBytes(reader.getTvfStream(), tvfPosition-tvfStart);
196 assert tvd.getFilePointer() == tvdPosition;
197 assert tvf.getFilePointer() == tvfPosition;
200 /** Close all streams. */
201 final void close() throws IOException {
202 // make an effort to close all streams we can but remember and re-throw
203 // the first exception encountered in this process
204 IOUtils.close(tvx, tvd, tvf);