--- /dev/null
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.SorterTemplate;
+
+final class TermsHashPerField extends InvertedDocConsumerPerField {
+
+ final TermsHashConsumerPerField consumer;
+
+ final TermsHashPerField nextPerField;
+ final TermsHashPerThread perThread;
+ final DocumentsWriter.DocState docState;
+ final FieldInvertState fieldState;
+ CharTermAttribute termAtt;
+
+ // Copied from our perThread
+ final CharBlockPool charPool;
+ final IntBlockPool intPool;
+ final ByteBlockPool bytePool;
+
+ final int streamCount;
+ final int numPostingInt;
+
+ final FieldInfo fieldInfo;
+
+ boolean postingsCompacted;
+ int numPostings;
+ private int postingsHashSize = 4;
+ private int postingsHashHalfSize = postingsHashSize/2;
+ private int postingsHashMask = postingsHashSize-1;
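+ // Open-addressing hash of termIDs, with -1 marking empty slots.  The
+ // size is always a power of two, so a hash code maps to a slot with
+ // (code & postingsHashMask).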
+ private int[] postingsHash;
+
+ ParallelPostingsArray postingsArray;
+
+ public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
+ this.perThread = perThread;
+ intPool = perThread.intPool;
+ charPool = perThread.charPool;
+ bytePool = perThread.bytePool;
+ docState = perThread.docState;
+
+ postingsHash = new int[postingsHashSize];
+ Arrays.fill(postingsHash, -1);
+ bytesUsed(postingsHashSize * RamUsageEstimator.NUM_BYTES_INT);
+
+ fieldState = docInverterPerField.fieldState;
+ this.consumer = perThread.consumer.addField(this, fieldInfo);
+ initPostingsArray();
+
+ streamCount = consumer.getStreamCount();
+ numPostingInt = 2*streamCount;
+ this.fieldInfo = fieldInfo;
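+ // Build the same field for the next TermsHash in the chain (if any);
+ // add() forwards each term's interned textStart to it.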
+ if (nextPerThread != null)
+ nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
+ else
+ nextPerField = null;
+ }
+
+ private void initPostingsArray() {
+ postingsArray = consumer.createPostingsArray(2);
+ bytesUsed(postingsArray.size * postingsArray.bytesPerPosting());
+ }
+
+ // sugar: forwards to the DocumentsWriter's RAM accounting (only if allocations are tracked)
+ private void bytesUsed(long size) {
+ if (perThread.termsHash.trackAllocations) {
+ perThread.termsHash.docWriter.bytesUsed(size);
+ }
+ }
+
+ void shrinkHash(int targetSize) {
+ assert postingsCompacted || numPostings == 0;
+
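+ // targetSize is currently unused: the hash is always shrunk back to its
+ // initial size of 4, and the postings array is freed so that it is
+ // reallocated lazily in start().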
+ final int newSize = 4;
+ if (newSize != postingsHash.length) {
+ final long previousSize = postingsHash.length;
+ postingsHash = new int[newSize];
+ bytesUsed((newSize-previousSize)*RamUsageEstimator.NUM_BYTES_INT);
+ Arrays.fill(postingsHash, -1);
+ postingsHashSize = newSize;
+ postingsHashHalfSize = newSize/2;
+ postingsHashMask = newSize-1;
+ }
+
+ // Fully free the postings array on each flush:
+ if (postingsArray != null) {
+ bytesUsed(-postingsArray.bytesPerPosting() * postingsArray.size);
+ postingsArray = null;
+ }
+ }
+
+ public void reset() {
+ if (!postingsCompacted)
+ compactPostings();
+ assert numPostings <= postingsHash.length;
+ if (numPostings > 0) {
+ Arrays.fill(postingsHash, 0, numPostings, -1);
+ numPostings = 0;
+ }
+ postingsCompacted = false;
+ if (nextPerField != null)
+ nextPerField.reset();
+ }
+
+ @Override
+ synchronized public void abort() {
+ reset();
+ if (nextPerField != null)
+ nextPerField.abort();
+ }
+
+ private final void growParallelPostingsArray() {
+ int oldSize = postingsArray.size;
+ this.postingsArray = this.postingsArray.grow();
+ bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize));
+ }
+
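+ // Positions reader over the bytes written to the given stream for
+ // termID, from the start of the stream's first slice up to its current
+ // write position (stored in the int pool).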
+ public void initReader(ByteSliceReader reader, int termID, int stream) {
+ assert stream < streamCount;
+ int intStart = postingsArray.intStarts[termID];
+ final int[] ints = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
+ final int upto = intStart & DocumentsWriter.INT_BLOCK_MASK;
+ reader.init(bytePool,
+ postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
+ ints[upto+stream]);
+ }
+
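+ // Packs all termIDs into the first numPostings slots of postingsHash;
+ // after this the table can no longer be probed until reset() clears it.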
+ private void compactPostings() {
+ int upto = 0;
+ for(int i=0;i<postingsHashSize;i++) {
+ if (postingsHash[i] != -1) {
+ if (upto < i) {
+ postingsHash[upto] = postingsHash[i];
+ postingsHash[i] = -1;
+ }
+ upto++;
+ }
+ }
+
+ assert upto == numPostings: "upto=" + upto + " numPostings=" + numPostings;
+ postingsCompacted = true;
+ }
+
+ /** Collapses the hash table and sorts the termIDs in place by term
+ * text; the first numPostings entries of the returned array hold the
+ * sorted termIDs. */
+ public int[] sortPostings() {
+ compactPostings();
+ final int[] postingsHash = this.postingsHash;
+ new SorterTemplate() {
+ @Override
+ protected void swap(int i, int j) {
+ final int o = postingsHash[i];
+ postingsHash[i] = postingsHash[j];
+ postingsHash[j] = o;
+ }
+
+ @Override
+ protected int compare(int i, int j) {
+ final int term1 = postingsHash[i], term2 = postingsHash[j];
+ if (term1 == term2)
+ return 0;
+ final int textStart1 = postingsArray.textStarts[term1],
+ textStart2 = postingsArray.textStarts[term2];
+ final char[] text1 = charPool.buffers[textStart1 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+ final int pos1 = textStart1 & DocumentsWriter.CHAR_BLOCK_MASK;
+ final char[] text2 = charPool.buffers[textStart2 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+ final int pos2 = textStart2 & DocumentsWriter.CHAR_BLOCK_MASK;
+ return comparePostings(text1, pos1, text2, pos2);
+ }
+
+ @Override
+ protected void setPivot(int i) {
+ pivotTerm = postingsHash[i];
+ final int textStart = postingsArray.textStarts[pivotTerm];
+ pivotBuf = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+ pivotBufPos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+ }
+
+ @Override
+ protected int comparePivot(int j) {
+ final int term = postingsHash[j];
+ if (pivotTerm == term)
+ return 0;
+ final int textStart = postingsArray.textStarts[term];
+ final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+ final int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+ return comparePostings(pivotBuf, pivotBufPos, text, pos);
+ }
+
+ private int pivotTerm, pivotBufPos;
+ private char[] pivotBuf;
+
+ /** Compares the term text for two Posting instances and
+ * returns -1 if p1 < p2; 1 if p1 > p2; else 0. */
+ private int comparePostings(final char[] text1, int pos1, final char[] text2, int pos2) {
+ assert text1 != text2 || pos1 != pos2;
+
+ while(true) {
+ final char c1 = text1[pos1++];
+ final char c2 = text2[pos2++];
+ if (c1 != c2) {
+ if (0xffff == c2)
+ return 1;
+ else if (0xffff == c1)
+ return -1;
+ else
+ return c1-c2;
+ } else
+ // This method should never compare equal postings
+ // unless p1==p2
+ assert c1 != 0xffff;
+ }
+ }
+ }.quickSort(0, numPostings-1);
+ return postingsHash;
+ }
+
+ /** Test whether the term text stored for termID equals the
+ * current tokenText. */
+ private boolean postingEquals(final int termID, final char[] tokenText, final int tokenTextLen) {
+ final int textStart = postingsArray.textStarts[termID];
+
+ final char[] text = perThread.charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+ assert text != null;
+ int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+
+ int tokenPos = 0;
+ for(;tokenPos<tokenTextLen;pos++,tokenPos++)
+ if (tokenText[tokenPos] != text[pos])
+ return false;
+ return 0xffff == text[pos];
+ }
+
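+ // Set by start(Fieldable[],int): whether our consumer (doCall) and the
+ // chained per-field (doNextCall) want add() called for the current
+ // field's tokens.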
+ private boolean doCall;
+ private boolean doNextCall;
+
+ @Override
+ void start(Fieldable f) {
+ termAtt = fieldState.attributeSource.addAttribute(CharTermAttribute.class);
+ consumer.start(f);
+ if (nextPerField != null) {
+ nextPerField.start(f);
+ }
+ }
+
+ @Override
+ boolean start(Fieldable[] fields, int count) throws IOException {
+ doCall = consumer.start(fields, count);
+ if (postingsArray == null) {
+ initPostingsArray();
+ }
+
+ if (nextPerField != null)
+ doNextCall = nextPerField.start(fields, count);
+ return doCall || doNextCall;
+ }
+
+ // Secondary entry point (for 2nd & subsequent TermsHash),
+ // because token text has already been "interned" into
+ // textStart, so we hash by textStart
+ public void add(int textStart) throws IOException {
+ int code = textStart;
+
+ int hashPos = code & postingsHashMask;
+
+ assert !postingsCompacted;
+
+ // Locate RawPostingList in hash
+ int termID = postingsHash[hashPos];
+
+ if (termID != -1 && postingsArray.textStarts[termID] != textStart) {
+ // Conflict: keep searching different locations in
+ // the hash table.
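+ // The increment is forced odd so that, with a power-of-two table
+ // size, the probe sequence visits every slot before repeating.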
+ final int inc = ((code>>8)+code)|1;
+ do {
+ code += inc;
+ hashPos = code & postingsHashMask;
+ termID = postingsHash[hashPos];
+ } while (termID != -1 && postingsArray.textStarts[termID] != textStart);
+ }
+
+ if (termID == -1) {
+
+ // First time we are seeing this token since we last
+ // flushed the hash.
+
+ // New posting
+ termID = numPostings++;
+ if (termID >= postingsArray.size) {
+ growParallelPostingsArray();
+ }
+
+ assert termID >= 0;
+
+ postingsArray.textStarts[termID] = textStart;
+
+ assert postingsHash[hashPos] == -1;
+ postingsHash[hashPos] = termID;
+
+ if (numPostings == postingsHashHalfSize)
+ rehashPostings(2*postingsHashSize);
+
+ // Init stream slices
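+ // Each term owns streamCount consecutive ints in the int pool; int i
+ // holds the current write position in the byte pool for stream i.  The
+ // streams' first slices are allocated back-to-back below, so initReader
+ // can locate stream i at byteStarts[termID] + i*FIRST_LEVEL_SIZE.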
+ if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
+ intPool.nextBuffer();
+
+ if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
+ bytePool.nextBuffer();
+
+ intUptos = intPool.buffer;
+ intUptoStart = intPool.intUpto;
+ intPool.intUpto += streamCount;
+
+ postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
+
+ for(int i=0;i<streamCount;i++) {
+ final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
+ intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
+ }
+ postingsArray.byteStarts[termID] = intUptos[intUptoStart];
+
+ consumer.newTerm(termID);
+
+ } else {
+ int intStart = postingsArray.intStarts[termID];
+ intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
+ intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
+ consumer.addTerm(termID);
+ }
+ }
+
+ // Primary entry point (for first TermsHash)
+ @Override
+ void add() throws IOException {
+
+ assert !postingsCompacted;
+
+ // We are first in the chain so we must "intern" the
+ // term text into textStart address
+
+ // Get the text of this term.
+ final char[] tokenText = termAtt.buffer();
+ final int tokenTextLen = termAtt.length();
+
+ // Compute hashcode & replace any invalid UTF16 sequences
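+ // Walk the term backwards: unpaired surrogates, and 0xffff (reserved as
+ // the terminator in the char pool), are replaced with U+FFFD, and the
+ // hash is accumulated in the same reverse order that rehashPostings
+ // uses when it recomputes it from the stored text.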
+ int downto = tokenTextLen;
+ int code = 0;
+ while (downto > 0) {
+ char ch = tokenText[--downto];
+
+ if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) {
+ if (0 == downto) {
+ // Unpaired
+ ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
+ } else {
+ final char ch2 = tokenText[downto-1];
+ if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) {
+ // OK: high followed by low. This is a valid
+ // surrogate pair.
+ code = ((code*31) + ch)*31+ch2;
+ downto--;
+ continue;
+ } else {
+ // Unpaired
+ ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
+ }
+ }
+ } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END ||
+ ch == 0xffff)) {
+ // Unpaired or 0xffff
+ ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
+ }
+
+ code = (code*31) + ch;
+ }
+
+ int hashPos = code & postingsHashMask;
+
+ // Locate RawPostingList in hash
+ int termID = postingsHash[hashPos];
+
+ if (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen)) {
+ // Conflict: keep searching different locations in
+ // the hash table.
+ final int inc = ((code>>8)+code)|1;
+ do {
+ code += inc;
+ hashPos = code & postingsHashMask;
+ termID = postingsHash[hashPos];
+ } while (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen));
+ }
+
+ if (termID == -1) {
+
+ // First time we are seeing this token since we last
+ // flushed the hash.
+ final int textLen1 = 1+tokenTextLen;
+ if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE) {
+ if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) {
+ // Just skip this term, to remain as robust as
+ // possible during indexing. A TokenFilter
+ // can be inserted into the analyzer chain if
+ // other behavior is wanted (pruning the term
+ // to a prefix, throwing an exception, etc).
+
+ if (docState.maxTermPrefix == null)
+ docState.maxTermPrefix = new String(tokenText, 0, 30);
+
+ consumer.skippingLongTerm();
+ return;
+ }
+ charPool.nextBuffer();
+ }
+
+ // New posting
+ termID = numPostings++;
+ if (termID >= postingsArray.size) {
+ growParallelPostingsArray();
+ }
+
+ assert termID != -1;
+
+ final char[] text = charPool.buffer;
+ final int textUpto = charPool.charUpto;
+ postingsArray.textStarts[termID] = textUpto + charPool.charOffset;
+ charPool.charUpto += textLen1;
+ System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
+ text[textUpto+tokenTextLen] = 0xffff;
+
+ assert postingsHash[hashPos] == -1;
+ postingsHash[hashPos] = termID;
+
+ if (numPostings == postingsHashHalfSize) {
+ rehashPostings(2*postingsHashSize);
+ bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT);
+ }
+
+ // Init stream slices
+ if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
+ intPool.nextBuffer();
+
+ if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
+ bytePool.nextBuffer();
+
+ intUptos = intPool.buffer;
+ intUptoStart = intPool.intUpto;
+ intPool.intUpto += streamCount;
+
+ postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
+
+ for(int i=0;i<streamCount;i++) {
+ final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
+ intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
+ }
+ postingsArray.byteStarts[termID] = intUptos[intUptoStart];
+
+ consumer.newTerm(termID);
+
+ } else {
+ final int intStart = postingsArray.intStarts[termID];
+ intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
+ intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
+ consumer.addTerm(termID);
+ }
+
+ if (doNextCall)
+ nextPerField.add(postingsArray.textStarts[termID]);
+ }
+
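+ // Address of the per-stream write positions (in the int pool) for the
+ // term most recently passed to add(); used by writeByte/writeVInt.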
+ int[] intUptos;
+ int intUptoStart;
+
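+ // Appends one byte to the given stream.  Byte-pool slices end with a
+ // non-zero sentinel, so finding a non-zero byte at the write position
+ // means the current slice is full and allocSlice must chain a new,
+ // larger one.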
+ void writeByte(int stream, byte b) {
+ int upto = intUptos[intUptoStart+stream];
+ byte[] bytes = bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
+ assert bytes != null;
+ int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK;
+ if (bytes[offset] != 0) {
+ // End of slice; allocate a new one
+ offset = bytePool.allocSlice(bytes, offset);
+ bytes = bytePool.buffer;
+ intUptos[intUptoStart+stream] = offset + bytePool.byteOffset;
+ }
+ bytes[offset] = b;
+ (intUptos[intUptoStart+stream])++;
+ }
+
+ public void writeBytes(int stream, byte[] b, int offset, int len) {
+ // TODO: optimize
+ final int end = offset + len;
+ for(int i=offset;i<end;i++)
+ writeByte(stream, b[i]);
+ }
+
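+ // Writes i as a VInt: 7 bits per byte, low-order bits first, with the
+ // high bit set on every byte except the last.  For example, 300
+ // (binary 100101100) is written as the two bytes 0xAC 0x02.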
+ void writeVInt(int stream, int i) {
+ assert stream < streamCount;
+ while ((i & ~0x7F) != 0) {
+ writeByte(stream, (byte)((i & 0x7f) | 0x80));
+ i >>>= 7;
+ }
+ writeByte(stream, (byte) i);
+ }
+
+ @Override
+ void finish() throws IOException {
+ try {
+ consumer.finish();
+ } finally {
+ if (nextPerField != null) {
+ nextPerField.finish();
+ }
+ }
+ }
+
+ /** Called when the postings hash is too small (> 50%
+ * occupied); rebuilds it at the given, larger size. */
+ void rehashPostings(final int newSize) {
+
+ final int newMask = newSize-1;
+
+ int[] newHash = new int[newSize];
+ Arrays.fill(newHash, -1);
+ for(int i=0;i<postingsHashSize;i++) {
+ int termID = postingsHash[i];
+ if (termID != -1) {
+ int code;
+ if (perThread.primary) {
+ final int textStart = postingsArray.textStarts[termID];
+ final int start = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+ final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+ int pos = start;
+ while(text[pos] != 0xffff)
+ pos++;
+ code = 0;
+ while (pos > start)
+ code = (code*31) + text[--pos];
+ } else
+ code = postingsArray.textStarts[termID];
+
+ int hashPos = code & newMask;
+ assert hashPos >= 0;
+ if (newHash[hashPos] != -1) {
+ final int inc = ((code>>8)+code)|1;
+ do {
+ code += inc;
+ hashPos = code & newMask;
+ } while (newHash[hashPos] != -1);
+ }
+ newHash[hashPos] = termID;
+ }
+ }
+
+ postingsHashMask = newMask;
+ postingsHash = newHash;
+
+ postingsHashSize = newSize;
+ postingsHashHalfSize = newSize >> 1;
+ }
+}