lucene-java-3.4.0/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java

   1 package org.apache.lucene.index;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.util.Arrays;
  22
  23 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  24 import org.apache.lucene.document.Fieldable;
  25 import org.apache.lucene.util.UnicodeUtil;
  26 import org.apache.lucene.util.RamUsageEstimator;
  27 import org.apache.lucene.util.SorterTemplate;
  28
  29 final class TermsHashPerField extends InvertedDocConsumerPerField {
  30
  31   final TermsHashConsumerPerField consumer;
  32
  33   final TermsHashPerField nextPerField;
  34   final TermsHashPerThread perThread;
  35   final DocumentsWriter.DocState docState;
  36   final FieldInvertState fieldState;
  37   CharTermAttribute termAtt;
  38
  39   // Copied from our perThread
  40   final CharBlockPool charPool;
  41   final IntBlockPool intPool;
  42   final ByteBlockPool bytePool;
  43
  44   final int streamCount;
  45   final int numPostingInt;
  46
  47   final FieldInfo fieldInfo;
  48
  49   boolean postingsCompacted;
  50   int numPostings;
  51   private int postingsHashSize = 4;
  52   private int postingsHashHalfSize = postingsHashSize/2;
  53   private int postingsHashMask = postingsHashSize-1;
  54   private int[] postingsHash;
  55
  56   ParallelPostingsArray postingsArray;
  57
  58   public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
  59     this.perThread = perThread;
  60     intPool = perThread.intPool;
  61     charPool = perThread.charPool;
  62     bytePool = perThread.bytePool;
  63     docState = perThread.docState;
  64
  65     postingsHash = new int[postingsHashSize];
  66     Arrays.fill(postingsHash, -1);
  67     bytesUsed(postingsHashSize * RamUsageEstimator.NUM_BYTES_INT);
  68
  69     fieldState = docInverterPerField.fieldState;
  70     this.consumer = perThread.consumer.addField(this, fieldInfo);
  71     initPostingsArray();
  72
  73     streamCount = consumer.getStreamCount();
  74     numPostingInt = 2*streamCount;
  75     this.fieldInfo = fieldInfo;
  76     if (nextPerThread != null)
  77       nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
  78     else
  79       nextPerField = null;
  80   }
  81
  82   private void initPostingsArray() {
  83     postingsArray = consumer.createPostingsArray(2);
  84     bytesUsed(postingsArray.size * postingsArray.bytesPerPosting());
  85   }
  86
  87   // sugar: just forwards to DW
  88   private void bytesUsed(long size) {
  89     if (perThread.termsHash.trackAllocations) {
  90       perThread.termsHash.docWriter.bytesUsed(size);
  91     }
  92   }
  93
  94   void shrinkHash(int targetSize) {
  95     assert postingsCompacted || numPostings == 0;
  96
  97     final int newSize = 4;
  98     if (newSize != postingsHash.length) {
  99       final long previousSize = postingsHash.length;
 100       postingsHash = new int[newSize];
 101       bytesUsed((newSize-previousSize)*RamUsageEstimator.NUM_BYTES_INT);
 102       Arrays.fill(postingsHash, -1);
 103       postingsHashSize = newSize;
 104       postingsHashHalfSize = newSize/2;
 105       postingsHashMask = newSize-1;
 106     }
 107
 108     // Fully free the postings array on each flush:
 109     if (postingsArray != null) {
 110       bytesUsed(-postingsArray.bytesPerPosting() * postingsArray.size);
 111       postingsArray = null;
 112     }
 113   }
 114
 115   public void reset() {
 116     if (!postingsCompacted)
 117       compactPostings();
 118     assert numPostings <= postingsHash.length;
 119     if (numPostings > 0) {
 120       Arrays.fill(postingsHash, 0, numPostings, -1);
 121       numPostings = 0;
 122     }
 123     postingsCompacted = false;
 124     if (nextPerField != null)
 125       nextPerField.reset();
 126   }
 127
 128   @Override
 129   synchronized public void abort() {
 130     reset();
 131     if (nextPerField != null)
 132       nextPerField.abort();
 133   }
 134
 135   private final void growParallelPostingsArray() {
 136     int oldSize = postingsArray.size;
 137     this.postingsArray = this.postingsArray.grow();
 138     bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize));
 139   }
 140
 141   public void initReader(ByteSliceReader reader, int termID, int stream) {
 142     assert stream < streamCount;
 143     int intStart = postingsArray.intStarts[termID];
 144     final int[] ints = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
 145     final int upto = intStart & DocumentsWriter.INT_BLOCK_MASK;
 146     reader.init(bytePool,
 147                 postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
 148                 ints[upto+stream]);
 149   }
 150
 151   private void compactPostings() {
 152     int upto = 0;
 153     for(int i=0;i<postingsHashSize;i++) {
 154       if (postingsHash[i] != -1) {
 155         if (upto < i) {
 156           postingsHash[upto] = postingsHash[i];
 157           postingsHash[i] = -1;
 158         }
 159         upto++;
 160       }
 161     }
 162
 163     assert upto == numPostings: "upto=" + upto + " numPostings=" + numPostings;
 164     postingsCompacted = true;
 165   }
 166
 167   /** Collapse the hash table & sort in-place. */
 168   public int[] sortPostings() {
 169     compactPostings();
 170     final int[] postingsHash = this.postingsHash;
 171     new SorterTemplate() {
 172       @Override
 173       protected void swap(int i, int j) {
 174         final int o = postingsHash[i];
 175         postingsHash[i] = postingsHash[j];
 176         postingsHash[j] = o;
 177       }
 178
 179       @Override
 180       protected int compare(int i, int j) {
 181         final int term1 = postingsHash[i], term2 = postingsHash[j];
 182         if (term1 == term2)
 183           return 0;
 184         final int textStart1 = postingsArray.textStarts[term1],
 185           textStart2 = postingsArray.textStarts[term2];
 186         final char[] text1 = charPool.buffers[textStart1 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
 187         final int pos1 = textStart1 & DocumentsWriter.CHAR_BLOCK_MASK;
 188         final char[] text2 = charPool.buffers[textStart2 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
 189         final int pos2 = textStart2 & DocumentsWriter.CHAR_BLOCK_MASK;
 190         return comparePostings(text1, pos1, text2, pos2);
 191       }
 192
 193       @Override
 194       protected void setPivot(int i) {
 195         pivotTerm = postingsHash[i];
 196         final int textStart = postingsArray.textStarts[pivotTerm];
 197         pivotBuf = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
 198         pivotBufPos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
 199       }
 200
 201       @Override
 202       protected int comparePivot(int j) {
 203         final int term = postingsHash[j];
 204         if (pivotTerm == term)
 205           return 0;
 206         final int textStart = postingsArray.textStarts[term];
 207         final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
 208         final int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
 209         return comparePostings(pivotBuf, pivotBufPos, text, pos);
 210       }
 211
 212       private int pivotTerm, pivotBufPos;
 213       private char[] pivotBuf;
 214
 215       /** Compares term text for two Posting instance and
 216        *  returns -1 if p1 < p2; 1 if p1 > p2; else 0. */
 217       private int comparePostings(final char[] text1, int pos1, final char[] text2, int pos2) {
 218         assert text1 != text2 || pos1 != pos2;
 219
 220         while(true) {
 221           final char c1 = text1[pos1++];
 222           final char c2 = text2[pos2++];
 223           if (c1 != c2) {
 224             if (0xffff == c2)
 225               return 1;
 226             else if (0xffff == c1)
 227               return -1;
 228             else
 229               return c1-c2;
 230           } else
 231             // This method should never compare equal postings
 232             // unless p1==p2
 233             assert c1 != 0xffff;
 234         }
 235       }
 236     }.quickSort(0, numPostings-1);
 237     return postingsHash;
 238   }
 239
 240   /** Test whether the text for current RawPostingList p equals
 241    *  current tokenText. */
 242   private boolean postingEquals(final int termID, final char[] tokenText, final int tokenTextLen) {
 243     final int textStart = postingsArray.textStarts[termID];
 244
 245     final char[] text = perThread.charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
 246     assert text != null;
 247     int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
 248
 249     int tokenPos = 0;
 250     for(;tokenPos<tokenTextLen;pos++,tokenPos++)
 251       if (tokenText[tokenPos] != text[pos])
 252         return false;
 253     return 0xffff == text[pos];
 254   }
 255
 256   private boolean doCall;
 257   private boolean doNextCall;
 258
 259   @Override
 260   void start(Fieldable f) {
 261     termAtt = fieldState.attributeSource.addAttribute(CharTermAttribute.class);
 262     consumer.start(f);
 263     if (nextPerField != null) {
 264       nextPerField.start(f);
 265     }
 266   }
 267
 268   @Override
 269   boolean start(Fieldable[] fields, int count) throws IOException {
 270     doCall = consumer.start(fields, count);
 271     if (postingsArray == null) {
 272       initPostingsArray();
 273     }
 274
 275     if (nextPerField != null)
 276       doNextCall = nextPerField.start(fields, count);
 277     return doCall || doNextCall;
 278   }
 279
 280   // Secondary entry point (for 2nd & subsequent TermsHash),
 281   // because token text has already been "interned" into
 282   // textStart, so we hash by textStart
 283   public void add(int textStart) throws IOException {
 284     int code = textStart;
 285
 286     int hashPos = code & postingsHashMask;
 287
 288     assert !postingsCompacted;
 289
 290     // Locate RawPostingList in hash
 291     int termID = postingsHash[hashPos];
 292
 293     if (termID != -1 && postingsArray.textStarts[termID] != textStart) {
 294       // Conflict: keep searching different locations in
 295       // the hash table.
 296       final int inc = ((code>>8)+code)|1;
 297       do {
 298         code += inc;
 299         hashPos = code & postingsHashMask;
 300         termID = postingsHash[hashPos];
 301       } while (termID != -1 && postingsArray.textStarts[termID] != textStart);
 302     }
 303
 304     if (termID == -1) {
 305
 306       // First time we are seeing this token since we last
 307       // flushed the hash.
 308
 309       // New posting
 310       termID = numPostings++;
 311       if (termID >= postingsArray.size) {
 312         growParallelPostingsArray();
 313       }
 314
 315       assert termID >= 0;
 316
 317       postingsArray.textStarts[termID] = textStart;
 318
 319       assert postingsHash[hashPos] == -1;
 320       postingsHash[hashPos] = termID;
 321
 322       if (numPostings == postingsHashHalfSize)
 323         rehashPostings(2*postingsHashSize);
 324
 325       // Init stream slices
 326       if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
 327         intPool.nextBuffer();
 328
 329       if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
 330         bytePool.nextBuffer();
 331
 332       intUptos = intPool.buffer;
 333       intUptoStart = intPool.intUpto;
 334       intPool.intUpto += streamCount;
 335
 336       postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
 337
 338       for(int i=0;i<streamCount;i++) {
 339         final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
 340         intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
 341       }
 342       postingsArray.byteStarts[termID] = intUptos[intUptoStart];
 343
 344       consumer.newTerm(termID);
 345
 346     } else {
 347       int intStart = postingsArray.intStarts[termID];
 348       intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
 349       intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
 350       consumer.addTerm(termID);
 351     }
 352   }
 353
 354   // Primary entry point (for first TermsHash)
 355   @Override
 356   void add() throws IOException {
 357
 358     assert !postingsCompacted;
 359
 360     // We are first in the chain so we must "intern" the
 361     // term text into textStart address
 362
 363     // Get the text of this term.
 364     final char[] tokenText = termAtt.buffer();
 365     final int tokenTextLen = termAtt.length();
 366
 367     // Compute hashcode & replace any invalid UTF16 sequences
 368     int downto = tokenTextLen;
 369     int code = 0;
 370     while (downto > 0) {
 371       char ch = tokenText[--downto];
 372
 373       if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) {
 374         if (0 == downto) {
 375           // Unpaired
 376           ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
 377         } else {
 378           final char ch2 = tokenText[downto-1];
 379           if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) {
 380             // OK: high followed by low.  This is a valid
 381             // surrogate pair.
 382             code = ((code*31) + ch)*31+ch2;
 383             downto--;
 384             continue;
 385           } else {
 386             // Unpaired
 387             ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
 388           }
 389         }
 390       } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END ||
 391                                                           ch == 0xffff)) {
 392         // Unpaired or 0xffff
 393         ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
 394       }
 395
 396       code = (code*31) + ch;
 397     }
 398
 399     int hashPos = code & postingsHashMask;
 400
 401     // Locate RawPostingList in hash
 402     int termID = postingsHash[hashPos];
 403
 404     if (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen)) {
 405       // Conflict: keep searching different locations in
 406       // the hash table.
 407       final int inc = ((code>>8)+code)|1;
 408       do {
 409         code += inc;
 410         hashPos = code & postingsHashMask;
 411         termID = postingsHash[hashPos];
 412       } while (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen));
 413     }
 414
 415     if (termID == -1) {
 416
 417       // First time we are seeing this token since we last
 418       // flushed the hash.
 419       final int textLen1 = 1+tokenTextLen;
 420       if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE) {
 421         if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) {
 422           // Just skip this term, to remain as robust as
 423           // possible during indexing.  A TokenFilter
 424           // can be inserted into the analyzer chain if
 425           // other behavior is wanted (pruning the term
 426           // to a prefix, throwing an exception, etc).
 427
 428           if (docState.maxTermPrefix == null)
 429             docState.maxTermPrefix = new String(tokenText, 0, 30);
 430
 431           consumer.skippingLongTerm();
 432           return;
 433         }
 434         charPool.nextBuffer();
 435       }
 436
 437       // New posting
 438       termID = numPostings++;
 439       if (termID >= postingsArray.size) {
 440         growParallelPostingsArray();
 441       }
 442
 443       assert termID != -1;
 444
 445       final char[] text = charPool.buffer;
 446       final int textUpto = charPool.charUpto;
 447       postingsArray.textStarts[termID] = textUpto + charPool.charOffset;
 448       charPool.charUpto += textLen1;
 449       System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
 450       text[textUpto+tokenTextLen] = 0xffff;
 451
 452       assert postingsHash[hashPos] == -1;
 453       postingsHash[hashPos] = termID;
 454
 455       if (numPostings == postingsHashHalfSize) {
 456         rehashPostings(2*postingsHashSize);
 457         bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT);
 458       }
 459
 460       // Init stream slices
 461       if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
 462         intPool.nextBuffer();
 463
 464       if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
 465         bytePool.nextBuffer();
 466
 467       intUptos = intPool.buffer;
 468       intUptoStart = intPool.intUpto;
 469       intPool.intUpto += streamCount;
 470
 471       postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
 472
 473       for(int i=0;i<streamCount;i++) {
 474         final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
 475         intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
 476       }
 477       postingsArray.byteStarts[termID] = intUptos[intUptoStart];
 478
 479       consumer.newTerm(termID);
 480
 481     } else {
 482       final int intStart = postingsArray.intStarts[termID];
 483       intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
 484       intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
 485       consumer.addTerm(termID);
 486     }
 487
 488     if (doNextCall)
 489       nextPerField.add(postingsArray.textStarts[termID]);
 490   }
 491
 492   int[] intUptos;
 493   int intUptoStart;
 494
 495   void writeByte(int stream, byte b) {
 496     int upto = intUptos[intUptoStart+stream];
 497     byte[] bytes = bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
 498     assert bytes != null;
 499     int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK;
 500     if (bytes[offset] != 0) {
 501       // End of slice; allocate a new one
 502       offset = bytePool.allocSlice(bytes, offset);
 503       bytes = bytePool.buffer;
 504       intUptos[intUptoStart+stream] = offset + bytePool.byteOffset;
 505     }
 506     bytes[offset] = b;
 507     (intUptos[intUptoStart+stream])++;
 508   }
 509
 510   public void writeBytes(int stream, byte[] b, int offset, int len) {
 511     // TODO: optimize
 512     final int end = offset + len;
 513     for(int i=offset;i<end;i++)
 514       writeByte(stream, b[i]);
 515   }
 516
 517   void writeVInt(int stream, int i) {
 518     assert stream < streamCount;
 519     while ((i & ~0x7F) != 0) {
 520       writeByte(stream, (byte)((i & 0x7f) | 0x80));
 521       i >>>= 7;
 522     }
 523     writeByte(stream, (byte) i);
 524   }
 525
 526   @Override
 527   void finish() throws IOException {
 528     try {
 529       consumer.finish();
 530     } finally {
 531       if (nextPerField != null) {
 532         nextPerField.finish();
 533       }
 534     }
 535   }
 536
 537   /** Called when postings hash is too small (> 50%
 538    *  occupied) or too large (< 20% occupied). */
 539   void rehashPostings(final int newSize) {
 540
 541     final int newMask = newSize-1;
 542
 543     int[] newHash = new int[newSize];
 544     Arrays.fill(newHash, -1);
 545     for(int i=0;i<postingsHashSize;i++) {
 546       int termID = postingsHash[i];
 547       if (termID != -1) {
 548         int code;
 549         if (perThread.primary) {
 550           final int textStart = postingsArray.textStarts[termID];
 551           final int start = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
 552           final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
 553           int pos = start;
 554           while(text[pos] != 0xffff)
 555             pos++;
 556           code = 0;
 557           while (pos > start)
 558             code = (code*31) + text[--pos];
 559         } else
 560           code = postingsArray.textStarts[termID];
 561
 562         int hashPos = code & newMask;
 563         assert hashPos >= 0;
 564         if (newHash[hashPos] != -1) {
 565           final int inc = ((code>>8)+code)|1;
 566           do {
 567             code += inc;
 568             hashPos = code & newMask;
 569           } while (newHash[hashPos] != -1);
 570         }
 571         newHash[hashPos] = termID;
 572       }
 573     }
 574
 575     postingsHashMask = newMask;
 576     postingsHash = newHash;
 577
 578     postingsHashSize = newSize;
 579     postingsHashHalfSize = newSize >> 1;
 580   }
 581 }