X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java?ds=inline

diff --git a/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java
new file mode 100644
index 0000000..e05ba80
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java
@@ -0,0 +1,581 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.SorterTemplate;
+
+final class TermsHashPerField extends InvertedDocConsumerPerField {
+
+  final TermsHashConsumerPerField consumer;
+
+  final TermsHashPerField nextPerField;
+  final TermsHashPerThread perThread;
+  final DocumentsWriter.DocState docState;
+  final FieldInvertState fieldState;
+  CharTermAttribute termAtt;
+
+  // Copied from our perThread
+  final CharBlockPool charPool;
+  final IntBlockPool intPool;
+  final ByteBlockPool bytePool;
+
+  final int streamCount;
+  final int numPostingInt;
+
+  final FieldInfo fieldInfo;
+
+  boolean postingsCompacted;
+  int numPostings;
+  private int postingsHashSize = 4;
+  private int postingsHashHalfSize = postingsHashSize/2;
+  private int postingsHashMask = postingsHashSize-1;
+  private int[] postingsHash;
+
+  ParallelPostingsArray postingsArray;
+
+  public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
+    this.perThread = perThread;
+    intPool = perThread.intPool;
+    charPool = perThread.charPool;
+    bytePool = perThread.bytePool;
+    docState = perThread.docState;
+
+    postingsHash = new int[postingsHashSize];
+    Arrays.fill(postingsHash, -1);
+    bytesUsed(postingsHashSize * RamUsageEstimator.NUM_BYTES_INT);
+
+    fieldState = docInverterPerField.fieldState;
+    this.consumer = perThread.consumer.addField(this, fieldInfo);
+    initPostingsArray();
+
+    streamCount = consumer.getStreamCount();
+    numPostingInt = 2*streamCount;
+    this.fieldInfo = fieldInfo;
+    if (nextPerThread != null)
+      nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
+    else
+      nextPerField = null;
+  }
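+
+  // Implementation note: terms are tracked in an open-addressed
+  // hash of termIDs (-1 marks an empty slot), while all per-term
+  // state lives in the parallel arrays of postingsArray
+  // (textStarts, intStarts, byteStarts) rather than in per-term
+  // objects, which keeps GC pressure and per-term RAM overhead
+  // low.  Every allocation is reported through bytesUsed() so
+  // DocumentsWriter can account for RAM and decide when to flush.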
+
+  private void initPostingsArray() {
+    postingsArray = consumer.createPostingsArray(2);
+    bytesUsed(postingsArray.size * postingsArray.bytesPerPosting());
+  }
+
+  // sugar: just forwards to DW
+  private void bytesUsed(long size) {
+    if (perThread.termsHash.trackAllocations) {
+      perThread.termsHash.docWriter.bytesUsed(size);
+    }
+  }
+
+  void shrinkHash(int targetSize) {
+    assert postingsCompacted || numPostings == 0;
+
+    final int newSize = 4;
+    if (newSize != postingsHash.length) {
+      final long previousSize = postingsHash.length;
+      postingsHash = new int[newSize];
+      bytesUsed((newSize-previousSize)*RamUsageEstimator.NUM_BYTES_INT);
+      Arrays.fill(postingsHash, -1);
+      postingsHashSize = newSize;
+      postingsHashHalfSize = newSize/2;
+      postingsHashMask = newSize-1;
+    }
+
+    // Fully free the postings array on each flush:
+    if (postingsArray != null) {
+      bytesUsed(-postingsArray.bytesPerPosting() * postingsArray.size);
+      postingsArray = null;
+    }
+  }
+
+  public void reset() {
+    if (!postingsCompacted)
+      compactPostings();
+    assert numPostings <= postingsHash.length;
+    if (numPostings > 0) {
+      Arrays.fill(postingsHash, 0, numPostings, -1);
+      numPostings = 0;
+    }
+    postingsCompacted = false;
+    if (nextPerField != null)
+      nextPerField.reset();
+  }
+
+  @Override
+  synchronized public void abort() {
+    reset();
+    if (nextPerField != null)
+      nextPerField.abort();
+  }
+
+  private final void growParallelPostingsArray() {
+    int oldSize = postingsArray.size;
+    this.postingsArray = this.postingsArray.grow();
+    bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize));
+  }
+
+  public void initReader(ByteSliceReader reader, int termID, int stream) {
+    assert stream < streamCount;
+    int intStart = postingsArray.intStarts[termID];
+    final int[] ints = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
+    final int upto = intStart & DocumentsWriter.INT_BLOCK_MASK;
+    reader.init(bytePool,
+                postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
+                ints[upto+stream]);
+  }
+
+  private void compactPostings() {
+    int upto = 0;
+    for(int i=0;i<postingsHashSize;i++) {
+      if (postingsHash[i] != -1) {
+        if (upto < i) {
+          postingsHash[upto] = postingsHash[i];
+          postingsHash[i] = -1;
+        }
+        upto++;
+      }
+    }
+
+    assert upto == numPostings;
+    postingsCompacted = true;
+  }
+
+  /** Collapse the hash table & sort in-place. */
+  public int[] sortPostings() {
+    compactPostings();
+    final int[] postingsHash = this.postingsHash;
+    new SorterTemplate() {
+      private int pivotTerm;
+
+      @Override
+      protected void swap(int i, int j) {
+        final int o = postingsHash[i];
+        postingsHash[i] = postingsHash[j];
+        postingsHash[j] = o;
+      }
+
+      @Override
+      protected int compare(int i, int j) {
+        return comparePostings(postingsHash[i], postingsHash[j]);
+      }
+
+      @Override
+      protected void setPivot(int i) {
+        pivotTerm = postingsHash[i];
+      }
+
+      @Override
+      protected int comparePivot(int j) {
+        return comparePostings(pivotTerm, postingsHash[j]);
+      }
+    }.quickSort(0, numPostings-1);
+    return postingsHash;
+  }
+
+  /** Compares term text for two postings and returns -1 if
+   *  term1 < term2; 1 if term1 > term2; else 0. */
+  private int comparePostings(int term1, int term2) {
+    if (term1 == term2) {
+      // Our quicksort does this, eg during partition
+      return 0;
+    }
+
+    final int textStart1 = postingsArray.textStarts[term1];
+    final int textStart2 = postingsArray.textStarts[term2];
+
+    final char[] text1 = charPool.buffers[textStart1 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+    int pos1 = textStart1 & DocumentsWriter.CHAR_BLOCK_MASK;
+    final char[] text2 = charPool.buffers[textStart2 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+    int pos2 = textStart2 & DocumentsWriter.CHAR_BLOCK_MASK;
+
+    assert text1 != text2 || pos1 != pos2;
+
+    while(true) {
+      final char c1 = text1[pos1++];
+      final char c2 = text2[pos2++];
+      if (c1 != c2) {
+        if (0xffff == c2)
+          return 1;
+        else if (0xffff == c1)
+          return -1;
+        else
+          return c1-c2;
+      } else
+        // This method should never compare equal postings
+        // unless pos1==pos2
+        assert c1 != 0xffff;
+    }
+  }
+
+  /** Test whether the text for the given termID equals the
+   *  current tokenText. */
+  private boolean postingEquals(final int termID, final char[] tokenText, final int tokenTextLen) {
+    final int textStart = postingsArray.textStarts[termID];
+    final char[] text = perThread.charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+    assert text != null;
+    int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+
+    int tokenPos = 0;
+    for(;tokenPos<tokenTextLen;pos++,tokenPos++)
+      if (tokenText[tokenPos] != text[pos])
+        return false;
+    return 0xffff == text[pos];
+  }
+
+  private boolean doCall;
+  private boolean doNextCall;
+
+  @Override
+  void start(Fieldable f) {
+    termAtt = fieldState.attributeSource.addAttribute(CharTermAttribute.class);
+    consumer.start(f);
+    if (nextPerField != null) {
+      nextPerField.start(f);
+    }
+  }
+
+  @Override
+  boolean start(Fieldable[] fields, int count) throws IOException {
+    doCall = consumer.start(fields, count);
+    if (postingsArray == null) {
+      initPostingsArray();
+    }
+    if (nextPerField != null)
+      doNextCall = nextPerField.start(fields, count);
+    return doCall || doNextCall;
+  }
+
+  // Secondary entry point (for 2nd & subsequent
+  // TermsHash), we hash by textStart
+  public void add(int textStart) throws IOException {
+    int code = textStart;
+
+    int hashPos = code & postingsHashMask;
+
+    assert !postingsCompacted;
+
+    // Locate RawPostingList in hash
+    int termID = postingsHash[hashPos];
+
+    if (termID != -1 && postingsArray.textStarts[termID] != textStart) {
+      // Conflict: keep searching different locations in
+      // the hash table.
+      final int inc = ((code>>8)+code)|1;
+      do {
+        code += inc;
+        hashPos = code & postingsHashMask;
+        termID = postingsHash[hashPos];
+      } while (termID != -1 && postingsArray.textStarts[termID] != textStart);
+    }
+
+    if (termID == -1) {
+
+      // First time we are seeing this token since we last
+      // flushed the hash.
+
+      // New posting
+      termID = numPostings++;
+      if (termID >= postingsArray.size) {
+        growParallelPostingsArray();
+      }
+
+      assert termID >= 0;
+
+      postingsArray.textStarts[termID] = textStart;
+
+      assert postingsHash[hashPos] == -1;
+      postingsHash[hashPos] = termID;
+
+      if (numPostings == postingsHashHalfSize)
+        rehashPostings(2*postingsHashSize);
+
+      // Init stream slices
+      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
+        intPool.nextBuffer();
+
+      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
+        bytePool.nextBuffer();
+
+      intUptos = intPool.buffer;
+      intUptoStart = intPool.intUpto;
+      intPool.intUpto += streamCount;
+
+      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
+
+      for(int i=0;i<streamCount;i++) {
+        final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
+        intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
+      }
+      postingsArray.byteStarts[termID] = intUptos[intUptoStart];
+
+      consumer.newTerm(termID);
+
+    } else {
+      int intStart = postingsArray.intStarts[termID];
+      intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
+      intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
+      consumer.addTerm(termID);
+    }
+  }
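+
+  // There are two entry points into the hash.  The primary add()
+  // below interns the term's characters into the char pool and
+  // hashes by term text; add(int textStart) above serves any
+  // downstream TermsHash in the chain, which can hash by the
+  // interned textStart address alone because the text has
+  // already been interned by the primary pass.  Both probe with
+  // a step of ((code>>8)+code)|1: the step is forced odd, so it
+  // is coprime with the power-of-two table size and visits every
+  // slot before repeating.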
+
+  // Primary entry point (for first TermsHash)
+  @Override
+  void add() throws IOException {
+
+    assert !postingsCompacted;
+
+    // We are first in the chain so we must "intern" the
+    // term text into textStart address
+
+    // Get the text of this term.
+    final char[] tokenText = termAtt.buffer();
+    final int tokenTextLen = termAtt.length();
+
+    // Compute hashcode & replace any invalid UTF16 sequences
+    int downto = tokenTextLen;
+    int code = 0;
+    while (downto > 0) {
+      char ch = tokenText[--downto];
+
+      if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) {
+        if (0 == downto) {
+          // Unpaired
+          ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
+        } else {
+          final char ch2 = tokenText[downto-1];
+          if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) {
+            // OK: high followed by low.  This is a valid
+            // surrogate pair.
+            code = ((code*31) + ch)*31+ch2;
+            downto--;
+            continue;
+          } else {
+            // Unpaired
+            ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
+          }
+        }
+      } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END ||
+                                                          ch == 0xffff)) {
+        // Unpaired or 0xffff
+        ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
+      }
+
+      code = (code*31) + ch;
+    }
+
+    int hashPos = code & postingsHashMask;
+
+    // Locate RawPostingList in hash
+    int termID = postingsHash[hashPos];
+
+    if (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen)) {
+      // Conflict: keep searching different locations in
+      // the hash table.
+      final int inc = ((code>>8)+code)|1;
+      do {
+        code += inc;
+        hashPos = code & postingsHashMask;
+        termID = postingsHash[hashPos];
+      } while (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen));
+    }
+
+    if (termID == -1) {
+
+      // First time we are seeing this token since we last
+      // flushed the hash.
+      final int textLen1 = 1+tokenTextLen;
+      if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE) {
+        if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) {
+          // Just skip this term, to remain as robust as
+          // possible during indexing.  A TokenFilter
+          // can be inserted into the analyzer chain if
+          // other behavior is wanted (pruning the term
+          // to a prefix, throwing an exception, etc).
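+
+          // Remember a prefix of the first skipped term so the
+          // indexing chain can report which term was too long;
+          // only the first 30 chars are kept below.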
+          if (docState.maxTermPrefix == null)
+            docState.maxTermPrefix = new String(tokenText, 0, 30);
+
+          consumer.skippingLongTerm();
+          return;
+        }
+        charPool.nextBuffer();
+      }
+
+      // New posting
+      termID = numPostings++;
+      if (termID >= postingsArray.size) {
+        growParallelPostingsArray();
+      }
+
+      assert termID != -1;
+
+      final char[] text = charPool.buffer;
+      final int textUpto = charPool.charUpto;
+      postingsArray.textStarts[termID] = textUpto + charPool.charOffset;
+      charPool.charUpto += textLen1;
+      System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
+      text[textUpto+tokenTextLen] = 0xffff;
+
+      assert postingsHash[hashPos] == -1;
+      postingsHash[hashPos] = termID;
+
+      if (numPostings == postingsHashHalfSize) {
+        rehashPostings(2*postingsHashSize);
+        bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT);
+      }
+
+      // Init stream slices
+      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
+        intPool.nextBuffer();
+
+      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
+        bytePool.nextBuffer();
+
+      intUptos = intPool.buffer;
+      intUptoStart = intPool.intUpto;
+      intPool.intUpto += streamCount;
+
+      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
+
+      for(int i=0;i<streamCount;i++) {
+        final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
+        intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
+      }
+      postingsArray.byteStarts[termID] = intUptos[intUptoStart];
+
+      consumer.newTerm(termID);
+
+    } else {
+      int intStart = postingsArray.intStarts[termID];
+      intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
+      intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
+      consumer.addTerm(termID);
+    }
+
+    if (doNextCall)
+      nextPerField.add(postingsArray.textStarts[termID]);
+  }
+
+  int[] intUptos;
+  int intUptoStart;
+
+  void writeByte(int stream, byte b) {
+    int upto = intUptos[intUptoStart+stream];
+    byte[] bytes = bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
+    assert bytes != null;
+    int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK;
+    if (bytes[offset] != 0) {
+      // End of slice; allocate a new one
+      offset = bytePool.allocSlice(bytes, offset);
+      bytes = bytePool.buffer;
+      intUptos[intUptoStart+stream] = offset + bytePool.byteOffset;
+    }
+    bytes[offset] = b;
+    (intUptos[intUptoStart+stream])++;
+  }
+
+  public void writeBytes(int stream, byte[] b, int offset, int len) {
+    // TODO: optimize
+    final int end = offset + len;
+    for(int i=offset;i<end;i++)
+      writeByte(stream, b[i]);
+  }
+
+  void writeVInt(int stream, int i) {
+    assert stream < streamCount;
+    while ((i & ~0x7F) != 0) {
+      writeByte(stream, (byte)((i & 0x7f) | 0x80));
+      i >>>= 7;
+    }
+    writeByte(stream, (byte) i);
+  }
+
+  @Override
+  void finish() throws IOException {
+    try {
+      consumer.finish();
+    } finally {
+      if (nextPerField != null) {
+        nextPerField.finish();
+      }
+    }
+  }
+
+  /** Called when postings hash is too small (> 50%
+   *  occupied) or too large (< 20% occupied). */
+  void rehashPostings(final int newSize) {
+
+    final int newMask = newSize-1;
+
+    int[] newHash = new int[newSize];
+    Arrays.fill(newHash, -1);
+    for(int i=0;i<postingsHashSize;i++) {
+      int termID = postingsHash[i];
+      if (termID != -1) {
+        int code;
+        if (perThread.primary) {
+          final int textStart = postingsArray.textStarts[termID];
+          final int start = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+          final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+          int pos = start;
+          while(text[pos] != 0xffff)
+            pos++;
+          code = 0;
+          while (pos > start)
+            code = (code*31) + text[--pos];
+        } else
+          code = postingsArray.textStarts[termID];
+
+        int hashPos = code & newMask;
+        assert hashPos >= 0;
+        if (newHash[hashPos] != -1) {
+          final int inc = ((code>>8)+code)|1;
+          do {
+            code += inc;
+            hashPos = code & newMask;
+          } while (newHash[hashPos] != -1);
+        }
+        newHash[hashPos] = termID;
+      }
+    }
+
+    postingsHashMask = newMask;
+    postingsHash = newHash;
+
+    postingsHashSize = newSize;
+    postingsHashHalfSize = newSize >> 1;
+  }
+}