--- /dev/null
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.UnicodeUtil;
+
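+/** Per-field consumer that gathers term vector postings for a single field of
+ * the current document. Positions and offsets are buffered per term as VInt
+ * deltas in two byte streams, and {@link #finish} then writes the completed
+ * vectors for the field into the per-document buffer, which the parent
+ * {@link TermVectorsTermsWriter} flushes to the real term vectors files. */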
+final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
+
+ final TermVectorsTermsWriterPerThread perThread;
+ final TermsHashPerField termsHashPerField;
+ final TermVectorsTermsWriter termsWriter;
+ final FieldInfo fieldInfo;
+ final DocumentsWriter.DocState docState;
+ final FieldInvertState fieldState;
+
+ boolean doVectors;
+ boolean doVectorPositions;
+ boolean doVectorOffsets;
+
+ int maxNumPostings; // Largest unique-term count seen for this field in any doc; used by shrinkHash()
+ OffsetAttribute offsetAttribute = null; // Looked up per field in start(Fieldable) when offsets are stored
+
+ public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo) {
+ this.termsHashPerField = termsHashPerField;
+ this.perThread = perThread;
+ this.termsWriter = perThread.termsWriter;
+ this.fieldInfo = fieldInfo;
+ docState = termsHashPerField.docState;
+ fieldState = termsHashPerField.fieldState;
+ }
+
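+ /** Two byte streams are buffered per term: stream 0 holds position
+ * deltas and stream 1 holds offset deltas (see newTerm/addTerm). */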
+ @Override
+ int getStreamCount() {
+ return 2;
+ }
+
+ @Override
+ boolean start(Fieldable[] fields, int count) {
+ doVectors = false;
+ doVectorPositions = false;
+ doVectorOffsets = false;
+
+ for (int i = 0; i < count; i++) {
+ Fieldable field = fields[i];
+ if (field.isIndexed() && field.isTermVectorStored()) {
+ doVectors = true;
+ doVectorPositions |= field.isStorePositionWithTermVector();
+ doVectorOffsets |= field.isStoreOffsetWithTermVector();
+ }
+ }
+
+ if (doVectors) {
+ if (perThread.doc == null) {
+ perThread.doc = termsWriter.getPerDoc();
+ perThread.doc.docID = docState.docID;
+ assert perThread.doc.numVectorFields == 0;
+ assert 0 == perThread.doc.perDocTvf.length();
+ assert 0 == perThread.doc.perDocTvf.getFilePointer();
+ }
+
+ assert perThread.doc.docID == docState.docID;
+
+ if (termsHashPerField.numPostings != 0) {
+ // Only necessary if previous doc hit a
+ // non-aborting exception while writing vectors in
+ // this field:
+ termsHashPerField.reset();
+ perThread.termsHashPerThread.reset(false);
+ }
+ }
+
+ // TODO: only if needed for performance
+ //perThread.postingsCount = 0;
+
+ return doVectors;
+ }
+
+ public void abort() {}
+
+ /** Called once per field per document if term vectors
+ * are enabled, to write the vectors to the per-document
+ * RAMOutputStream, which is then quickly flushed to the
+ * real term vectors files in the Directory. */
+ @Override
+ void finish() throws IOException {
+
+ assert docState.testPoint("TermVectorsTermsWriterPerField.finish start");
+
+ final int numPostings = termsHashPerField.numPostings;
+
+ assert numPostings >= 0;
+
+ if (!doVectors || numPostings == 0)
+ return;
+
+ if (numPostings > maxNumPostings)
+ maxNumPostings = numPostings;
+
+ final IndexOutput tvf = perThread.doc.perDocTvf;
+
+ // This is called once, after inverting all occurrences
+ // of a given field in the doc. At this point we flush
+ // our hash into the DocWriter.
+
+ assert fieldInfo.storeTermVector;
+ assert perThread.vectorFieldsInOrder(fieldInfo);
+
+ perThread.doc.addField(termsHashPerField.fieldInfo.number);
+ TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
+
+ final int[] termIDs = termsHashPerField.sortPostings();
+
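+ // Per-field header in the tvf buffer: the unique term count, then a flags
+ // byte recording whether positions and/or offsets follow each term: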
+ tvf.writeVInt(numPostings);
+ byte bits = 0x0;
+ if (doVectorPositions)
+ bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
+ if (doVectorOffsets)
+ bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
+ tvf.writeByte(bits);
+
+ int encoderUpto = 0;
+ int lastTermBytesCount = 0;
+
+ final ByteSliceReader reader = perThread.vectorSliceReader;
+ final char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
+ for (int j = 0; j < numPostings; j++) {
+ final int termID = termIDs[j];
+ final int freq = postings.freqs[termID];
+
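+ // textStarts[termID] packs the term's location in the shared char pool:
+ // the high bits select the buffer and the CHAR_BLOCK_MASK bits the offset.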
+ final char[] text2 = charBuffers[postings.textStarts[termID] >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+ final int start2 = postings.textStarts[termID] & DocumentsWriter.CHAR_BLOCK_MASK;
+
+ // We swap between two encoders to save copying
+ // last Term's byte array
+ final UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];
+
+ // TODO: we could do this incrementally
+ UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
+ final int termBytesCount = utf8Result.length;
+
+ // TODO: UTF16toUTF8 could tell us this prefix
+ // Compute common prefix between last term and
+ // this term
+ int prefix = 0;
+ if (j > 0) {
+ final byte[] lastTermBytes = perThread.utf8Results[1-encoderUpto].result;
+ final byte[] termBytes = perThread.utf8Results[encoderUpto].result;
+ while (prefix < lastTermBytesCount && prefix < termBytesCount) {
+ if (lastTermBytes[prefix] != termBytes[prefix])
+ break;
+ prefix++;
+ }
+ }
+ encoderUpto = 1-encoderUpto;
+ lastTermBytesCount = termBytesCount;
+
+ final int suffix = termBytesCount - prefix;
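+ // Terms are front-coded against their predecessor: e.g. if the last term
+ // was "apple" and this one is "apply", we write prefix=4, suffix=1 and
+ // then the single byte for 'y'.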
+ tvf.writeVInt(prefix);
+ tvf.writeVInt(suffix);
+ tvf.writeBytes(utf8Result.result, prefix, suffix);
+ tvf.writeVInt(freq);
+
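+ // The position/offset slices were buffered in newTerm/addTerm using the
+ // same VInt delta encoding the tvf format expects, so copy them verbatim: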
+ if (doVectorPositions) {
+ termsHashPerField.initReader(reader, termID, 0);
+ reader.writeTo(tvf);
+ }
+
+ if (doVectorOffsets) {
+ termsHashPerField.initReader(reader, termID, 1);
+ reader.writeTo(tvf);
+ }
+ }
+
+ termsHashPerField.reset();
+
+ // NOTE: we clear, per-field, at the thread level,
+ // because term vectors fully write themselves on each
+ // field; this saves RAM (e.g. if a large doc has two
+ // large fields w/ term vectors on) because we
+ // recycle/reuse all RAM after each field:
+ perThread.termsHashPerThread.reset(false);
+ }
+
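+ /** Trims the postings hash back down to the largest size this field
+ * actually used, freeing RAM, and resets the high-water mark. */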
+ void shrinkHash() {
+ termsHashPerField.shrinkHash(maxNumPostings);
+ maxNumPostings = 0;
+ }
+
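+ /** Grabs the OffsetAttribute from the shared AttributeSource when this
+ * field stores offsets, so newTerm/addTerm can read start/end offsets. */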
+ @Override
+ void start(Fieldable f) {
+ if (doVectorOffsets) {
+ offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
+ } else {
+ offsetAttribute = null;
+ }
+ }
+
+ @Override
+ void newTerm(final int termID) {
+
+ assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start");
+
+ TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
+
+ postings.freqs[termID] = 1;
+
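+ // First occurrence of this term in the doc: write absolute values;
+ // later occurrences (addTerm) delta-encode against what we save here.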
+ if (doVectorOffsets) {
+ int startOffset = fieldState.offset + offsetAttribute.startOffset();
+ int endOffset = fieldState.offset + offsetAttribute.endOffset();
+
+ termsHashPerField.writeVInt(1, startOffset);
+ termsHashPerField.writeVInt(1, endOffset - startOffset);
+ postings.lastOffsets[termID] = endOffset;
+ }
+
+ if (doVectorPositions) {
+ termsHashPerField.writeVInt(0, fieldState.position);
+ postings.lastPositions[termID] = fieldState.position;
+ }
+ }
+
+ @Override
+ void addTerm(final int termID) {
+
+ assert docState.testPoint("TermVectorsTermsWriterPerField.addTerm start");
+
+ TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
+
+ postings.freqs[termID]++;
+
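+ // Repeat occurrence: delta-encode the start offset against this term's
+ // last end offset, and the position against its last position, keeping
+ // the buffered VInts small.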
+ if (doVectorOffsets) {
+ int startOffset = fieldState.offset + offsetAttribute.startOffset();
+ int endOffset = fieldState.offset + offsetAttribute.endOffset();
+
+ termsHashPerField.writeVInt(1, startOffset - postings.lastOffsets[termID]);
+ termsHashPerField.writeVInt(1, endOffset - startOffset);
+ postings.lastOffsets[termID] = endOffset;
+ }
+
+ if (doVectorPositions) {
+ termsHashPerField.writeVInt(0, fieldState.position - postings.lastPositions[termID]);
+ postings.lastPositions[termID] = fieldState.position;
+ }
+ }
+
+ @Override
+ void skippingLongTerm() {}
+
+ @Override
+ ParallelPostingsArray createPostingsArray(int size) {
+ return new TermVectorsPostingsArray(size);
+ }
+
+ static final class TermVectorsPostingsArray extends ParallelPostingsArray {
+ public TermVectorsPostingsArray(int size) {
+ super(size);
+ freqs = new int[size];
+ lastOffsets = new int[size];
+ lastPositions = new int[size];
+ }
+
+ int[] freqs; // How many times this term occurred in the current doc
+ int[] lastOffsets; // Last offset we saw
+ int[] lastPositions; // Last position where this term occurred
+
+ @Override
+ ParallelPostingsArray newInstance(int size) {
+ return new TermVectorsPostingsArray(size);
+ }
+
+ @Override
+ void copyTo(ParallelPostingsArray toArray, int numToCopy) {
+ assert toArray instanceof TermVectorsPostingsArray;
+ TermVectorsPostingsArray to = (TermVectorsPostingsArray) toArray;
+
+ super.copyTo(toArray, numToCopy);
+
+ System.arraycopy(freqs, 0, to.freqs, 0, numToCopy);
+ System.arraycopy(lastOffsets, 0, to.lastOffsets, 0, numToCopy);
+ System.arraycopy(lastPositions, 0, to.lastPositions, 0, numToCopy);
+ }
+
+ @Override
+ int bytesPerPosting() {
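+ // Each posting adds three parallel ints (freqs, lastOffsets,
+ // lastPositions) on top of what ParallelPostingsArray itself tracks: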
+ return super.bytesPerPosting() + 3 * RamUsageEstimator.NUM_BYTES_INT;
+ }
+ }
+}