1 package org.apache.lucene.index;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
22 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
23 import org.apache.lucene.document.Fieldable;
24 import org.apache.lucene.store.IndexOutput;
25 import org.apache.lucene.util.UnicodeUtil;
26 import org.apache.lucene.util.RamUsageEstimator;
28 final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
// Per-field consumer that inverts one field's term vectors into the in-memory
// terms hash and later flushes them to the per-doc tvf stream (see finish()).
// NOTE(review): the leading number on every line is the line number from the
// original file; gaps in that numbering mark lines dropped by this extraction.
30 final TermVectorsTermsWriterPerThread perThread;  // owning per-thread writer
31 final TermsHashPerField termsHashPerField;  // backing postings hash for this field
32 final TermVectorsTermsWriter termsWriter;  // shared writer, source of per-doc state (getPerDoc)
33 final FieldInfo fieldInfo;  // metadata for the field being inverted
34 final DocumentsWriter.DocState docState;  // current document state (docID, test points)
35 final FieldInvertState fieldState;  // position/offset state during inversion
// Which optional term-vector payloads to record for the current document;
// recomputed per doc in start(Fieldable[], int):
38 boolean doVectorPositions;
39 boolean doVectorOffsets;
// Source of start/end offsets when doVectorOffsets is set; assigned in
// start(Fieldable), read in newTerm()/addTerm():
42 OffsetAttribute offsetAttribute = null;
44 public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo) {
// Wires this per-field consumer to its per-thread writer and shares the
// doc/field invert state owned by the terms hash.
45 this.termsHashPerField = termsHashPerField;
46 this.perThread = perThread;
47 this.termsWriter = perThread.termsWriter;
48 this.fieldInfo = fieldInfo;
49 docState = termsHashPerField.docState;
50 fieldState = termsHashPerField.fieldState;
// NOTE(review): the closing brace (original line ~51) was dropped by the extraction.
// Number of byte streams written per posting — presumably 2 (stream 0 =
// positions, stream 1 = offsets, as used in finish()/newTerm()/addTerm()),
// but the body (original lines 55-58) is missing from this extraction, so
// the actual return value cannot be confirmed here.
54 int getStreamCount() {
59 boolean start(Fieldable[] fields, int count) {
// Decide, once per document, whether this field's instances want term
// vectors at all and which extras (positions/offsets) to record.
61 doVectorPositions = false;
62 doVectorOffsets = false;
64 for(int i=0;i<count;i++) {
65 Fieldable field = fields[i];
66 if (field.isIndexed() && field.isTermVectorStored()) {
// Any single instance asking for positions/offsets turns them on.
68 doVectorPositions |= field.isStorePositionWithTermVector();
69 doVectorOffsets |= field.isStoreOffsetWithTermVector();
// NOTE(review): original lines 67 and 70-73 (presumably a doVectors flag
// assignment plus the if/loop closers) were dropped by the extraction.
// Lazily allocate the per-doc state holding the RAM tvf stream.
74 if (perThread.doc == null) {
75 perThread.doc = termsWriter.getPerDoc();
76 perThread.doc.docID = docState.docID;
77 assert perThread.doc.numVectorFields == 0;
78 assert 0 == perThread.doc.perDocTvf.length();
79 assert 0 == perThread.doc.perDocTvf.getFilePointer();
// else-branch (closer/else on dropped lines 80-81): doc already allocated,
// so it must belong to the document currently being processed.
82 assert perThread.doc.docID == docState.docID;
84 if (termsHashPerField.numPostings != 0) {
85 // Only necessary if previous doc hit a
86 // non-aborting exception while writing vectors in
// (comment continuation on original line 87 was dropped)
88 termsHashPerField.reset();
89 perThread.termsHashPerThread.reset(false);
93 // TODO: only if needed for performance
94 //perThread.postingsCount = 0;
// NOTE(review): the method's return statement and closing brace (original
// lines ~95-98) are missing from this extraction.
// Intentionally empty — no per-field cleanup is performed here.
99 public void abort() {}
101 /** Called once per field per document if term vectors
102 * are enabled, to write the vectors to
103 * RAMOutputStream, which is then quickly flushed to
104 * the real term vectors files in the Directory. */
106 void finish() throws IOException {
108 assert docState.testPoint("TermVectorsTermsWriterPerField.finish start");
110 final int numPostings = termsHashPerField.numPostings;
112 assert numPostings >= 0;
// Nothing to write if vectors are disabled or the field produced no terms.
// NOTE(review): `doVectors` and `maxNumPostings` are declared on lines this
// extraction dropped; the early-return body of this `if` is also missing.
114 if (!doVectors || numPostings == 0)
117 if (numPostings > maxNumPostings)
118 maxNumPostings = numPostings;
120 final IndexOutput tvf = perThread.doc.perDocTvf;
122 // This is called once, after inverting all occurrences
123 // of a given field in the doc. At this point we flush
124 // our hash into the DocWriter.
126 assert fieldInfo.storeTermVector;
127 assert perThread.vectorFieldsInOrder(fieldInfo);
129 perThread.doc.addField(termsHashPerField.fieldInfo.number);
130 TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
// Sort the hash so terms are emitted to the tvf stream in sorted order.
132 final int[] termIDs = termsHashPerField.sortPostings();
134 tvf.writeVInt(numPostings);
// Flag byte telling the reader which extras follow each term.
// NOTE(review): declaration of `bits` (original line ~135) was dropped, as
// was the `doVectorOffsets` guard (original line 138) before line 139.
136 if (doVectorPositions)
137 bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
139 bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
143 int lastTermBytesCount = 0;
145 final ByteSliceReader reader = perThread.vectorSliceReader;
146 final char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
147 for(int j=0;j<numPostings;j++) {
148 final int termID = termIDs[j];
149 final int freq = postings.freqs[termID];
// Locate this term's UTF-16 text inside the shared char block pool.
151 final char[] text2 = charBuffers[postings.textStarts[termID] >> DocumentsWriter.CHAR_BLOCK_SHIFT];
152 final int start2 = postings.textStarts[termID] & DocumentsWriter.CHAR_BLOCK_MASK;
154 // We swap between two encoders to save copying
155 // last Term's byte array
// NOTE(review): `encoderUpto` is declared on a dropped line.
156 final UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];
158 // TODO: we could do this incrementally
159 UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
160 final int termBytesCount = utf8Result.length;
162 // TODO: UTF16toUTF8 could tell us this prefix
163 // Compute common prefix between last term and
// this term, so only the differing suffix needs writing.
// NOTE(review): initialization of `prefix` (original lines 164-166) dropped.
167 final byte[] lastTermBytes = perThread.utf8Results[1-encoderUpto].result;
168 final byte[] termBytes = perThread.utf8Results[encoderUpto].result;
169 while(prefix < lastTermBytesCount && prefix < termBytesCount) {
170 if (lastTermBytes[prefix] != termBytes[prefix])
// (loop body continuation — break / prefix increment — on dropped lines 171-174)
175 encoderUpto = 1-encoderUpto;
176 lastTermBytesCount = termBytesCount;
// Write the term as <prefix length, suffix length, suffix bytes>.
178 final int suffix = termBytesCount - prefix;
179 tvf.writeVInt(prefix);
180 tvf.writeVInt(suffix);
181 tvf.writeBytes(utf8Result.result, prefix, suffix);
// Copy the per-term byte slices accumulated during inversion into the tvf:
// stream 0 = positions, stream 1 = offsets (copy calls on dropped lines).
184 if (doVectorPositions) {
185 termsHashPerField.initReader(reader, termID, 0);
189 if (doVectorOffsets) {
190 termsHashPerField.initReader(reader, termID, 1);
195 termsHashPerField.reset();
197 // NOTE: we clear, per-field, at the thread level,
198 // because term vectors fully write themselves on each
199 // field; this saves RAM (eg if large doc has two large
200 // fields w/ term vectors on) because we recycle/reuse
201 // all RAM after each field:
202 perThread.termsHashPerThread.reset(false);
// NOTE(review): this statement sits inside a method whose declaration
// (original line ~205) was dropped by the extraction — presumably a
// shrinkHash() helper that trims the hash to the largest field seen.
206 termsHashPerField.shrinkHash(maxNumPostings);
211 void start(Fieldable f) {
// Grab the offset attribute from the token stream only when offsets are
// being stored for this field; otherwise leave it cleared.
212 if (doVectorOffsets) {
213 offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
// (else-branch keyword on dropped original line 214)
215 offsetAttribute = null;
220 void newTerm(final int termID) {
// First occurrence of a term in this field/doc: set freq to 1 and record
// the absolute offset/position so later hits can be delta-encoded.
222 assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start");
224 TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
226 postings.freqs[termID] = 1;
228 if (doVectorOffsets) {
229 int startOffset = fieldState.offset + offsetAttribute.startOffset();
230 int endOffset = fieldState.offset + offsetAttribute.endOffset();
// Stream 1 holds offsets: absolute start, then token length, for the first hit.
232 termsHashPerField.writeVInt(1, startOffset);
233 termsHashPerField.writeVInt(1, endOffset - startOffset);
234 postings.lastOffsets[termID] = endOffset;
237 if (doVectorPositions) {
// Stream 0 holds positions: the first hit is written as an absolute position.
238 termsHashPerField.writeVInt(0, fieldState.position);
239 postings.lastPositions[termID] = fieldState.position;
244 void addTerm(final int termID) {
// Subsequent occurrence of a term: bump its freq and write offset/position
// deltas against the values remembered in lastOffsets/lastPositions.
246 assert docState.testPoint("TermVectorsTermsWriterPerField.addTerm start");
248 TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
250 postings.freqs[termID]++;
252 if (doVectorOffsets) {
253 int startOffset = fieldState.offset + offsetAttribute.startOffset();
254 int endOffset = fieldState.offset + offsetAttribute.endOffset();
// Delta-encode the start against the previous end offset, then the length.
256 termsHashPerField.writeVInt(1, startOffset - postings.lastOffsets[termID]);
257 termsHashPerField.writeVInt(1, endOffset - startOffset);
258 postings.lastOffsets[termID] = endOffset;
261 if (doVectorPositions) {
// Delta-encode the position against this term's previous position.
262 termsHashPerField.writeVInt(0, fieldState.position - postings.lastPositions[termID]);
263 postings.lastPositions[termID] = fieldState.position;
// Intentionally empty — over-length terms are skipped without extra work here.
268 void skippingLongTerm() {}
// Factory hook: allocate the parallel array holding per-term vector state.
271 ParallelPostingsArray createPostingsArray(int size) {
272 return new TermVectorsPostingsArray(size);
275 static final class TermVectorsPostingsArray extends ParallelPostingsArray {
// Parallel per-term arrays (indexed by termID) used while building a single
// document's term vectors.
276 public TermVectorsPostingsArray(int size) {
// NOTE(review): the super(size) call (original line 277) was dropped.
278 freqs = new int[size];
279 lastOffsets = new int[size];
280 lastPositions = new int[size];
283 int[] freqs; // How many times this term occurred in the current doc
284 int[] lastOffsets; // Last offset we saw
285 int[] lastPositions; // Last position where this term occurred
// Growth hook: allocate a larger instance of the same concrete type.
288 ParallelPostingsArray newInstance(int size) {
289 return new TermVectorsPostingsArray(size);
// Copies this array's per-term state into another (growing) instance.
293 void copyTo(ParallelPostingsArray toArray, int numToCopy) {
294 assert toArray instanceof TermVectorsPostingsArray;
295 TermVectorsPostingsArray to = (TermVectorsPostingsArray) toArray;
297 super.copyTo(toArray, numToCopy);
// NOTE(review): copies `size` elements rather than `numToCopy` — presumably
// `size` is inherited from ParallelPostingsArray; verify against that class.
299 System.arraycopy(freqs, 0, to.freqs, 0, size);
300 System.arraycopy(lastOffsets, 0, to.lastOffsets, 0, size);
301 System.arraycopy(lastPositions, 0, to.lastPositions, 0, size);
// RAM accounting: three extra parallel ints per posting on top of the base.
305 int bytesPerPosting() {
306 return super.bytesPerPosting() + 3 * RamUsageEstimator.NUM_BYTES_INT;