package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.SorterTemplate;
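
/**
 * Accumulates postings for one field into an in-memory hash until flush:
 * each unique term is hashed into {@code postingsHash}, its text is interned
 * into the char pool (terminated by 0xffff), and per-term data is written by
 * the {@link TermsHashConsumerPerField} into per-stream byte slices.  A
 * second TermsHashPerField can be chained via {@code nextPerField}; it reuses
 * the text already interned by the first one (see the secondary
 * {@code add(int textStart)} entry point).
 */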
final class TermsHashPerField extends InvertedDocConsumerPerField {

  final TermsHashConsumerPerField consumer;

  final TermsHashPerField nextPerField;
  final TermsHashPerThread perThread;
  final DocumentsWriter.DocState docState;
  final FieldInvertState fieldState;
  CharTermAttribute termAtt;

  // Copied from our perThread
  final CharBlockPool charPool;
  final IntBlockPool intPool;
  final ByteBlockPool bytePool;

  final int streamCount;
  final int numPostingInt;

  final FieldInfo fieldInfo;
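
  // Open-addressed hash table mapping a probe slot to a termID (-1 = empty).
  // The size is always a power of two so that (code & postingsHashMask) picks
  // a slot; the table is doubled via rehashPostings() once it reaches 50%
  // occupancy (numPostings == postingsHashHalfSize).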
  boolean postingsCompacted;
  int numPostings;

  private int postingsHashSize = 4;
  private int postingsHashHalfSize = postingsHashSize/2;
  private int postingsHashMask = postingsHashSize-1;
  private int[] postingsHash;

  ParallelPostingsArray postingsArray;
  public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
    this.perThread = perThread;
    intPool = perThread.intPool;
    charPool = perThread.charPool;
    bytePool = perThread.bytePool;
    docState = perThread.docState;

    postingsHash = new int[postingsHashSize];
    Arrays.fill(postingsHash, -1);
    bytesUsed(postingsHashSize * RamUsageEstimator.NUM_BYTES_INT);

    fieldState = docInverterPerField.fieldState;
    this.consumer = perThread.consumer.addField(this, fieldInfo);

    streamCount = consumer.getStreamCount();
    numPostingInt = 2*streamCount;
    this.fieldInfo = fieldInfo;
    if (nextPerThread != null)
      nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
    else
      nextPerField = null;
  }
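
  // The parallel postings array is allocated lazily, on the first
  // start(Fieldable[], int) call, and freed again in shrinkHash(), so an
  // unused field holds no per-term RAM.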
  private void initPostingsArray() {
    postingsArray = consumer.createPostingsArray(2);
    bytesUsed(postingsArray.size * postingsArray.bytesPerPosting());
  }
  // sugar: just forwards to DW
  private void bytesUsed(long size) {
    if (perThread.termsHash.trackAllocations) {
      perThread.termsHash.docWriter.bytesUsed(size);
    }
  }
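
  // Called at flush time: shrinks the hash table back to its initial size of 4
  // and drops the postings array entirely, crediting the freed bytes back to
  // the DocumentsWriter RAM accounting (targetSize is currently not consulted).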
  void shrinkHash(int targetSize) {
    assert postingsCompacted || numPostings == 0;

    final int newSize = 4;
    if (newSize != postingsHash.length) {
      final long previousSize = postingsHash.length;
      postingsHash = new int[newSize];
      bytesUsed((newSize-previousSize)*RamUsageEstimator.NUM_BYTES_INT);
      Arrays.fill(postingsHash, -1);
      postingsHashSize = newSize;
      postingsHashHalfSize = newSize/2;
      postingsHashMask = newSize-1;
    }

    // Fully free the postings array on each flush:
    if (postingsArray != null) {
      bytesUsed(-postingsArray.bytesPerPosting() * postingsArray.size);
      postingsArray = null;
    }
  }
  public void reset() {
    if (!postingsCompacted)
      compactPostings();
    assert numPostings <= postingsHash.length;

    if (numPostings > 0) {
      Arrays.fill(postingsHash, 0, numPostings, -1);
      numPostings = 0;
    }
    postingsCompacted = false;
    if (nextPerField != null)
      nextPerField.reset();
  }
  synchronized public void abort() {
    reset();
    if (nextPerField != null)
      nextPerField.abort();
  }
  private final void growParallelPostingsArray() {
    int oldSize = postingsArray.size;
    this.postingsArray = this.postingsArray.grow();
    bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize));
  }
  public void initReader(ByteSliceReader reader, int termID, int stream) {
    assert stream < streamCount;
    int intStart = postingsArray.intStarts[termID];
    final int[] ints = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
    final int upto = intStart & DocumentsWriter.INT_BLOCK_MASK;
    reader.init(bytePool,
                postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
                ints[upto+stream]);
  }
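
  // Moves every live entry to the front of postingsHash so that slots
  // [0, numPostings) hold exactly the termIDs seen so far; the table is no
  // longer usable for hash lookups afterwards (see the !postingsCompacted
  // asserts in the add() methods).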
  private void compactPostings() {
    int upto = 0;
    for(int i=0;i<postingsHashSize;i++) {
      if (postingsHash[i] != -1) {
        if (upto < i) {
          postingsHash[upto] = postingsHash[i];
          postingsHash[i] = -1;
        }
        upto++;
      }
    }

    assert upto == numPostings: "upto=" + upto + " numPostings=" + numPostings;
    postingsCompacted = true;
  }
  /** Collapse the hash table & sort in-place. */
  public int[] sortPostings() {
    compactPostings();
    final int[] postingsHash = this.postingsHash;
    new SorterTemplate() {
      protected void swap(int i, int j) {
        final int o = postingsHash[i];
        postingsHash[i] = postingsHash[j];
        postingsHash[j] = o;
      }

      protected int compare(int i, int j) {
        final int term1 = postingsHash[i], term2 = postingsHash[j];
        if (term1 == term2)
          return 0;
        final int textStart1 = postingsArray.textStarts[term1],
          textStart2 = postingsArray.textStarts[term2];
        final char[] text1 = charPool.buffers[textStart1 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        final int pos1 = textStart1 & DocumentsWriter.CHAR_BLOCK_MASK;
        final char[] text2 = charPool.buffers[textStart2 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        final int pos2 = textStart2 & DocumentsWriter.CHAR_BLOCK_MASK;
        return comparePostings(text1, pos1, text2, pos2);
      }

      protected void setPivot(int i) {
        pivotTerm = postingsHash[i];
        final int textStart = postingsArray.textStarts[pivotTerm];
        pivotBuf = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        pivotBufPos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
      }

      protected int comparePivot(int j) {
        final int term = postingsHash[j];
        if (pivotTerm == term)
          return 0;
        final int textStart = postingsArray.textStarts[term];
        final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        final int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
        return comparePostings(pivotBuf, pivotBufPos, text, pos);
      }

      private int pivotTerm, pivotBufPos;
      private char[] pivotBuf;

      /** Compares term text for two Posting instances and
       *  returns -1 if p1 < p2; 1 if p1 > p2; else 0. */
      private int comparePostings(final char[] text1, int pos1, final char[] text2, int pos2) {
        assert text1 != text2 || pos1 != pos2;

        while(true) {
          final char c1 = text1[pos1++];
          final char c2 = text2[pos2++];
          if (c1 != c2) {
            if (0xffff == c2)
              return 1;
            else if (0xffff == c1)
              return -1;
            else
              return c1-c2;
          } else {
            // This method should never compare equal postings,
            // so the 0xffff terminator must not be reached on both sides
            assert c1 != 0xffff;
          }
        }
      }
    }.quickSort(0, numPostings-1);
    return postingsHash;
  }
  /** Test whether the text for current RawPostingList p equals
   *  current tokenText. */
  private boolean postingEquals(final int termID, final char[] tokenText, final int tokenTextLen) {
    final int textStart = postingsArray.textStarts[termID];

    final char[] text = perThread.charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
    assert text != null;

    int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;

    int tokenPos = 0;
    for(;tokenPos<tokenTextLen;pos++,tokenPos++)
      if (tokenText[tokenPos] != text[pos])
        return false;
    return 0xffff == text[pos];
  }
  private boolean doCall;
  private boolean doNextCall;
  void start(Fieldable f) {
    termAtt = fieldState.attributeSource.addAttribute(CharTermAttribute.class);
    consumer.start(f);
    if (nextPerField != null) {
      nextPerField.start(f);
    }
  }
  boolean start(Fieldable[] fields, int count) throws IOException {
    doCall = consumer.start(fields, count);
    if (postingsArray == null) {
      initPostingsArray();
    }

    if (nextPerField != null)
      doNextCall = nextPerField.start(fields, count);
    return doCall || doNextCall;
  }
  // Secondary entry point (for 2nd & subsequent TermsHash),
  // because token text has already been "interned" into
  // textStart, so we hash by textStart
  public void add(int textStart) throws IOException {
    int code = textStart;

    int hashPos = code & postingsHashMask;

    assert !postingsCompacted;

    // Locate RawPostingList in hash
    int termID = postingsHash[hashPos];

    if (termID != -1 && postingsArray.textStarts[termID] != textStart) {
      // Conflict: keep searching different locations in
      // the hash table.
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        hashPos = code & postingsHashMask;
        termID = postingsHash[hashPos];
      } while (termID != -1 && postingsArray.textStarts[termID] != textStart);
    }

    if (termID == -1) {

      // First time we are seeing this token since we last
      // flushed the hash.

      // New posting
      termID = numPostings++;
      if (termID >= postingsArray.size) {
        growParallelPostingsArray();
      }

      assert termID >= 0;

      postingsArray.textStarts[termID] = textStart;

      assert postingsHash[hashPos] == -1;
      postingsHash[hashPos] = termID;

      if (numPostings == postingsHashHalfSize)
        rehashPostings(2*postingsHashSize);

      // Init stream slices
      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
        intPool.nextBuffer();

      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
        bytePool.nextBuffer();

      intUptos = intPool.buffer;
      intUptoStart = intPool.intUpto;
      intPool.intUpto += streamCount;

      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;

      for(int i=0;i<streamCount;i++) {
        final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
        intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
      }
      postingsArray.byteStarts[termID] = intUptos[intUptoStart];

      consumer.newTerm(termID);

    } else {
      int intStart = postingsArray.intStarts[termID];
      intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
      intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
      consumer.addTerm(termID);
    }
  }
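
  // The primary entry point below must additionally intern the term text:
  // it hashes the token's chars (replacing broken surrogate pairs and 0xffff
  // with the Unicode replacement char), copies them into the char pool with a
  // 0xffff terminator, and then proceeds like the secondary entry point above.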
  // Primary entry point (for first TermsHash)
  void add() throws IOException {

    assert !postingsCompacted;

    // We are first in the chain so we must "intern" the
    // term text into textStart address

    // Get the text of this term.
    final char[] tokenText = termAtt.buffer();
    final int tokenTextLen = termAtt.length();

    // Compute hashcode & replace any invalid UTF16 sequences
    int downto = tokenTextLen;
    int code = 0;
    while (downto > 0) {
      char ch = tokenText[--downto];

      if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) {
        if (0 == downto) {
          // Unpaired
          ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
        } else {
          final char ch2 = tokenText[downto-1];
          if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) {
            // OK: high followed by low.  This is a valid
            // surrogate pair.
            code = ((code*31) + ch)*31+ch2;
            downto--;
            continue;
          } else {
            // Unpaired
            ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
          }
        }
      } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END ||
                                                          ch == 0xffff)) {
        // Unpaired or 0xffff
        ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
      }

      code = (code*31) + ch;
    }

    int hashPos = code & postingsHashMask;

    // Locate RawPostingList in hash
    int termID = postingsHash[hashPos];

    if (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen)) {
      // Conflict: keep searching different locations in
      // the hash table.
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        hashPos = code & postingsHashMask;
        termID = postingsHash[hashPos];
      } while (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen));
    }

    if (termID == -1) {

      // First time we are seeing this token since we last
      // flushed the hash.
      final int textLen1 = 1+tokenTextLen;
      if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE) {
        if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) {
          // Just skip this term, to remain as robust as
          // possible during indexing.  A TokenFilter
          // can be inserted into the analyzer chain if
          // other behavior is wanted (pruning the term
          // to a prefix, throwing an exception, etc).

          if (docState.maxTermPrefix == null)
            docState.maxTermPrefix = new String(tokenText, 0, 30);

          consumer.skippingLongTerm();
          return;
        }
        charPool.nextBuffer();
      }

      // New posting
      termID = numPostings++;
      if (termID >= postingsArray.size) {
        growParallelPostingsArray();
      }

      assert termID >= 0;

      final char[] text = charPool.buffer;
      final int textUpto = charPool.charUpto;
      postingsArray.textStarts[termID] = textUpto + charPool.charOffset;
      charPool.charUpto += textLen1;
      System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
      text[textUpto+tokenTextLen] = 0xffff;

      assert postingsHash[hashPos] == -1;
      postingsHash[hashPos] = termID;

      if (numPostings == postingsHashHalfSize) {
        rehashPostings(2*postingsHashSize);
        bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT);
      }

      // Init stream slices
      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
        intPool.nextBuffer();

      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
        bytePool.nextBuffer();

      intUptos = intPool.buffer;
      intUptoStart = intPool.intUpto;
      intPool.intUpto += streamCount;

      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;

      for(int i=0;i<streamCount;i++) {
        final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
        intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
      }
      postingsArray.byteStarts[termID] = intUptos[intUptoStart];

      consumer.newTerm(termID);

    } else {
      final int intStart = postingsArray.intStarts[termID];
      intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
      intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
      consumer.addTerm(termID);
    }

    if (doNextCall)
      nextPerField.add(postingsArray.textStarts[termID]);
  }

  int[] intUptos;
  int intUptoStart;
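
  // Each byte slice ends in a non-zero level marker, so finding a non-zero
  // byte at the write position means the slice is full; allocSlice() then
  // chains a new, typically larger slice and returns the offset at which
  // writing continues.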
  void writeByte(int stream, byte b) {
    int upto = intUptos[intUptoStart+stream];
    byte[] bytes = bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
    assert bytes != null;
    int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK;
    if (bytes[offset] != 0) {
      // End of slice; allocate a new one
      offset = bytePool.allocSlice(bytes, offset);
      bytes = bytePool.buffer;
      intUptos[intUptoStart+stream] = offset + bytePool.byteOffset;
    }
    bytes[offset] = b;
    (intUptos[intUptoStart+stream])++;
  }
  public void writeBytes(int stream, byte[] b, int offset, int len) {
    final int end = offset + len;
    for(int i=offset;i<end;i++)
      writeByte(stream, b[i]);
  }
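
  // Variable-length int encoding: 7 data bits per byte, least-significant
  // group first, with the high bit set on every byte except the last.  For
  // example, 300 (0x12C) is written as the two bytes 0xAC, 0x02.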
  void writeVInt(int stream, int i) {
    assert stream < streamCount;
    while ((i & ~0x7F) != 0) {
      writeByte(stream, (byte)((i & 0x7f) | 0x80));
      i >>>= 7;
    }
    writeByte(stream, (byte) i);
  }
  void finish() throws IOException {
    consumer.finish();

    if (nextPerField != null) {
      nextPerField.finish();
    }
  }
  /** Called when postings hash is too small (> 50%
   *  occupied) or too large (< 20% occupied). */
  void rehashPostings(final int newSize) {

    final int newMask = newSize-1;

    int[] newHash = new int[newSize];
    Arrays.fill(newHash, -1);
    for(int i=0;i<postingsHashSize;i++) {
      int termID = postingsHash[i];
      if (termID != -1) {
        int code;
        if (perThread.primary) {
          final int textStart = postingsArray.textStarts[termID];
          final int start = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
          final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
          int pos = start;
          while(text[pos] != 0xffff)
            pos++;
          code = 0;
          while (pos > start)
            code = (code*31) + text[--pos];
        } else
          code = postingsArray.textStarts[termID];

        int hashPos = code & newMask;
        assert hashPos >= 0;
        if (newHash[hashPos] != -1) {
          final int inc = ((code>>8)+code)|1;
          do {
            code += inc;
            hashPos = code & newMask;
          } while (newHash[hashPos] != -1);
        }
        newHash[hashPos] = termID;
      }
    }

    postingsHashMask = newMask;
    postingsHash = newHash;

    postingsHashSize = newSize;
    postingsHashHalfSize = newSize >> 1;
  }
}