lucene-java-3.5.0/lucene/src/java/org/apache/lucene/util/BytesRefHash.java

   1 package org.apache.lucene.util;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK;
  21 import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SHIFT;
  22 import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
  23
  24 import java.util.Arrays;
  25 import java.util.Comparator;
  26 import java.util.concurrent.atomic.AtomicLong;
  27
  28 import org.apache.lucene.util.ByteBlockPool.DirectAllocator;
  29
  30 /**
  31  * {@link BytesRefHash} is a special purpose hash-map like data-structure
  32  * optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of
  33  * byte arrays to ordinal (Map<BytesRef,int>) storing the hashed bytes
  34  * efficiently in continuous storage. The mapping to the ordinal is
  35  * encapsulated inside {@link BytesRefHash} and is guaranteed to be increased
  36  * for each added {@link BytesRef}.
  37  *
  38  * <p>
  39  * Note: The maximum capacity {@link BytesRef} instance passed to
  40  * {@link #add(BytesRef)} must not be longer than {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2.
  41  * The internal storage is limited to 2GB total byte storage.
  42  * </p>
  43  *
  44  * @lucene.internal
  45  */
  46 public final class BytesRefHash {
  47
  48   public static final int DEFAULT_CAPACITY = 16;
  49
  50   // the following fields are needed by comparator,
  51   // so package private to prevent access$-methods:
  52   final ByteBlockPool pool;
  53   int[] bytesStart;
  54
  55   private final BytesRef scratch1 = new BytesRef();
  56   private int hashSize;
  57   private int hashHalfSize;
  58   private int hashMask;
  59   private int count;
  60   private int lastCount = -1;
  61   private int[] ords;
  62   private final BytesStartArray bytesStartArray;
  63   private AtomicLong bytesUsed;
  64
  65   /**
  66    * Creates a new {@link BytesRefHash} with a {@link ByteBlockPool} using a
  67    * {@link DirectAllocator}.
  68    */
  69   public BytesRefHash() {
  70     this(new ByteBlockPool(new DirectAllocator()));
  71   }
  72
  73   /**
  74    * Creates a new {@link BytesRefHash}
  75    */
  76   public BytesRefHash(ByteBlockPool pool) {
  77     this(pool, DEFAULT_CAPACITY, new DirectBytesStartArray(DEFAULT_CAPACITY));
  78   }
  79
  80   /**
  81    * Creates a new {@link BytesRefHash}
  82    */
  83   public BytesRefHash(ByteBlockPool pool, int capacity,
  84       BytesStartArray bytesStartArray) {
  85     hashSize = capacity;
  86     hashHalfSize = hashSize >> 1;
  87     hashMask = hashSize - 1;
  88     this.pool = pool;
  89     ords = new int[hashSize];
  90     Arrays.fill(ords, -1);
  91     this.bytesStartArray = bytesStartArray;
  92     bytesStart = bytesStartArray.init();
  93     bytesUsed = bytesStartArray.bytesUsed() == null? new AtomicLong(0) : bytesStartArray.bytesUsed();;
  94     bytesUsed.addAndGet(hashSize * RamUsageEstimator.NUM_BYTES_INT);
  95   }
  96
  97   /**
  98    * Returns the number of {@link BytesRef} values in this {@link BytesRefHash}.
  99    *
 100    * @return the number of {@link BytesRef} values in this {@link BytesRefHash}.
 101    */
 102   public int size() {
 103     return count;
 104   }
 105
 106   /**
 107    * Populates and returns a {@link BytesRef} with the bytes for the given ord.
 108    * <p>
 109    * Note: the given ord must be a positive integer less that the current size (
 110    * {@link #size()})
 111    * </p>
 112    *
 113    * @param ord the ord
 114    * @param ref the {@link BytesRef} to populate
 115    *
 116    * @return the given BytesRef instance populated with the bytes for the given ord
 117    */
 118   public BytesRef get(int ord, BytesRef ref) {
 119     assert bytesStart != null : "bytesStart is null - not initialized";
 120     assert ord < bytesStart.length: "ord exceeds byteStart len: " + bytesStart.length;
 121     return pool.setBytesRef(ref, bytesStart[ord]);
 122   }
 123
 124   /**
 125    * Returns the ords array in arbitrary order. Valid ords start at offset of 0
 126    * and end at a limit of {@link #size()} - 1
 127    * <p>
 128    * Note: This is a destructive operation. {@link #clear()} must be called in
 129    * order to reuse this {@link BytesRefHash} instance.
 130    * </p>
 131    */
 132   public int[] compact() {
 133     assert bytesStart != null : "Bytesstart is null - not initialized";
 134     int upto = 0;
 135     for (int i = 0; i < hashSize; i++) {
 136       if (ords[i] != -1) {
 137         if (upto < i) {
 138           ords[upto] = ords[i];
 139           ords[i] = -1;
 140         }
 141         upto++;
 142       }
 143     }
 144
 145     assert upto == count;
 146     lastCount = count;
 147     return ords;
 148   }
 149
 150   /**
 151    * Returns the values array sorted by the referenced byte values.
 152    * <p>
 153    * Note: This is a destructive operation. {@link #clear()} must be called in
 154    * order to reuse this {@link BytesRefHash} instance.
 155    * </p>
 156    *
 157    * @param comp
 158    *          the {@link Comparator} used for sorting
 159    */
 160   public int[] sort(final Comparator<BytesRef> comp) {
 161     final int[] compact = compact();
 162     new SorterTemplate() {
 163       @Override
 164       protected void swap(int i, int j) {
 165         final int o = compact[i];
 166         compact[i] = compact[j];
 167         compact[j] = o;
 168       }
 169
 170       @Override
 171       protected int compare(int i, int j) {
 172         final int ord1 = compact[i], ord2 = compact[j];
 173         assert bytesStart.length > ord1 && bytesStart.length > ord2;
 174         return comp.compare(pool.setBytesRef(scratch1, bytesStart[ord1]),
 175           pool.setBytesRef(scratch2, bytesStart[ord2]));
 176       }
 177
 178       @Override
 179       protected void setPivot(int i) {
 180         final int ord = compact[i];
 181         assert bytesStart.length > ord;
 182         pool.setBytesRef(pivot, bytesStart[ord]);
 183       }
 184
 185       @Override
 186       protected int comparePivot(int j) {
 187         final int ord = compact[j];
 188         assert bytesStart.length > ord;
 189         return comp.compare(pivot,
 190           pool.setBytesRef(scratch2, bytesStart[ord]));
 191       }
 192
 193       private final BytesRef pivot = new BytesRef(),
 194         scratch1 = new BytesRef(), scratch2 = new BytesRef();
 195     }.quickSort(0, count - 1);
 196     return compact;
 197   }
 198
 199   private boolean equals(int ord, BytesRef b) {
 200     return pool.setBytesRef(scratch1, bytesStart[ord]).bytesEquals(b);
 201   }
 202
 203   private boolean shrink(int targetSize) {
 204     // Cannot use ArrayUtil.shrink because we require power
 205     // of 2:
 206     int newSize = hashSize;
 207     while (newSize >= 8 && newSize / 4 > targetSize) {
 208       newSize /= 2;
 209     }
 210     if (newSize != hashSize) {
 211       bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT
 212           * -(hashSize - newSize));
 213       hashSize = newSize;
 214       ords = new int[hashSize];
 215       Arrays.fill(ords, -1);
 216       hashHalfSize = newSize / 2;
 217       hashMask = newSize - 1;
 218       return true;
 219     } else {
 220       return false;
 221     }
 222   }
 223
 224   /**
 225    * Clears the {@link BytesRef} which maps to the given {@link BytesRef}
 226    */
 227   public void clear(boolean resetPool) {
 228     lastCount = count;
 229     count = 0;
 230     if (resetPool) {
 231       pool.dropBuffersAndReset();
 232     }
 233     bytesStart = bytesStartArray.clear();
 234     if (lastCount != -1 && shrink(lastCount)) {
 235       // shrink clears the hash entries
 236       return;
 237     }
 238     Arrays.fill(ords, -1);
 239   }
 240
 241   public void clear() {
 242     clear(true);
 243   }
 244
 245   /**
 246    * Closes the BytesRefHash and releases all internally used memory
 247    */
 248   public void close() {
 249     clear(true);
 250     ords = null;
 251     bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT
 252         * -hashSize);
 253   }
 254
 255   /**
 256    * Adds a new {@link BytesRef}
 257    *
 258    * @param bytes
 259    *          the bytes to hash
 260    * @return the ord the given bytes are hashed if there was no mapping for the
 261    *         given bytes, otherwise <code>(-(ord)-1)</code>. This guarantees
 262    *         that the return value will always be &gt;= 0 if the given bytes
 263    *         haven't been hashed before.
 264    *
 265    * @throws MaxBytesLengthExceededException
 266    *           if the given bytes are > 2 +
 267    *           {@link ByteBlockPool#BYTE_BLOCK_SIZE}
 268    */
 269   public int add(BytesRef bytes) {
 270     return add(bytes, bytes.hashCode());
 271   }
 272
 273   /**
 274    * Adds a new {@link BytesRef} with a pre-calculated hash code.
 275    *
 276    * @param bytes
 277    *          the bytes to hash
 278    * @param code
 279    *          the bytes hash code
 280    *
 281    *          <p>
 282    *          Hashcode is defined as:
 283    *
 284    *          <pre>
 285    * int hash = 0;
 286    * for (int i = offset; i &lt; offset + length; i++) {
 287    *   hash = 31 * hash + bytes[i];
 288    * }
 289    * </pre>
 290    *
 291    * @return the ord the given bytes are hashed if there was no mapping for the
 292    *         given bytes, otherwise <code>(-(ord)-1)</code>. This guarantees
 293    *         that the return value will always be &gt;= 0 if the given bytes
 294    *         haven't been hashed before.
 295    *
 296    * @throws MaxBytesLengthExceededException
 297    *           if the given bytes are >
 298    *           {@link ByteBlockPool#BYTE_BLOCK_SIZE} - 2
 299    */
 300   public int add(BytesRef bytes, int code) {
 301     assert bytesStart != null : "Bytesstart is null - not initialized";
 302     final int length = bytes.length;
 303     // final position
 304     int hashPos = code & hashMask;
 305     int e = ords[hashPos];
 306     if (e != -1 && !equals(e, bytes)) {
 307       // Conflict: keep searching different locations in
 308       // the hash table.
 309       final int inc = ((code >> 8) + code) | 1;
 310       do {
 311         code += inc;
 312         hashPos = code & hashMask;
 313         e = ords[hashPos];
 314       } while (e != -1 && !equals(e, bytes));
 315     }
 316
 317     if (e == -1) {
 318       // new entry
 319       final int len2 = 2 + bytes.length;
 320       if (len2 + pool.byteUpto > BYTE_BLOCK_SIZE) {
 321         if (len2 > BYTE_BLOCK_SIZE) {
 322           throw new MaxBytesLengthExceededException("bytes can be at most "
 323               + (BYTE_BLOCK_SIZE - 2) + " in length; got " + bytes.length);
 324         }
 325         pool.nextBuffer();
 326       }
 327       final byte[] buffer = pool.buffer;
 328       final int bufferUpto = pool.byteUpto;
 329       if (count >= bytesStart.length) {
 330         bytesStart = bytesStartArray.grow();
 331         assert count < bytesStart.length + 1 : "count: " + count + " len: "
 332             + bytesStart.length;
 333       }
 334       e = count++;
 335
 336       bytesStart[e] = bufferUpto + pool.byteOffset;
 337
 338       // We first encode the length, followed by the
 339       // bytes. Length is encoded as vInt, but will consume
 340       // 1 or 2 bytes at most (we reject too-long terms,
 341       // above).
 342       if (length < 128) {
 343         // 1 byte to store length
 344         buffer[bufferUpto] = (byte) length;
 345         pool.byteUpto += length + 1;
 346         assert length >= 0: "Length must be positive: " + length;
 347         System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 1,
 348             length);
 349       } else {
 350         // 2 byte to store length
 351         buffer[bufferUpto] = (byte) (0x80 | (length & 0x7f));
 352         buffer[bufferUpto + 1] = (byte) ((length >> 7) & 0xff);
 353         pool.byteUpto += length + 2;
 354         System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2,
 355             length);
 356       }
 357       assert ords[hashPos] == -1;
 358       ords[hashPos] = e;
 359
 360       if (count == hashHalfSize) {
 361         rehash(2 * hashSize, true);
 362       }
 363       return e;
 364     }
 365     return -(e + 1);
 366   }
 367
 368   public int addByPoolOffset(int offset) {
 369     assert bytesStart != null : "Bytesstart is null - not initialized";
 370     // final position
 371     int code = offset;
 372     int hashPos = offset & hashMask;
 373     int e = ords[hashPos];
 374     if (e != -1 && bytesStart[e] != offset) {
 375       // Conflict: keep searching different locations in
 376       // the hash table.
 377       final int inc = ((code >> 8) + code) | 1;
 378       do {
 379         code += inc;
 380         hashPos = code & hashMask;
 381         e = ords[hashPos];
 382       } while (e != -1 && bytesStart[e] != offset);
 383     }
 384     if (e == -1) {
 385       // new entry
 386       if (count >= bytesStart.length) {
 387         bytesStart = bytesStartArray.grow();
 388         assert count < bytesStart.length + 1 : "count: " + count + " len: "
 389             + bytesStart.length;
 390       }
 391       e = count++;
 392       bytesStart[e] = offset;
 393       assert ords[hashPos] == -1;
 394       ords[hashPos] = e;
 395
 396       if (count == hashHalfSize) {
 397         rehash(2 * hashSize, false);
 398       }
 399       return e;
 400     }
 401     return -(e + 1);
 402   }
 403
 404   /**
 405    * Called when hash is too small (> 50% occupied) or too large (< 20%
 406    * occupied).
 407    */
 408   private void rehash(final int newSize, boolean hashOnData) {
 409     final int newMask = newSize - 1;
 410     bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (newSize));
 411     final int[] newHash = new int[newSize];
 412     Arrays.fill(newHash, -1);
 413     for (int i = 0; i < hashSize; i++) {
 414       final int e0 = ords[i];
 415       if (e0 != -1) {
 416         int code;
 417         if (hashOnData) {
 418           final int off = bytesStart[e0];
 419           final int start = off & BYTE_BLOCK_MASK;
 420           final byte[] bytes = pool.buffers[off >> BYTE_BLOCK_SHIFT];
 421           code = 0;
 422           final int len;
 423           int pos;
 424           if ((bytes[start] & 0x80) == 0) {
 425             // length is 1 byte
 426             len = bytes[start];
 427             pos = start + 1;
 428           } else {
 429             len = (bytes[start] & 0x7f) + ((bytes[start + 1] & 0xff) << 7);
 430             pos = start + 2;
 431           }
 432
 433           final int endPos = pos + len;
 434           while (pos < endPos) {
 435             code = BytesRef.HASH_PRIME * code + bytes[pos++];
 436           }
 437         } else {
 438           code = bytesStart[e0];
 439         }
 440
 441         int hashPos = code & newMask;
 442         assert hashPos >= 0;
 443         if (newHash[hashPos] != -1) {
 444           final int inc = ((code >> 8) + code) | 1;
 445           do {
 446             code += inc;
 447             hashPos = code & newMask;
 448           } while (newHash[hashPos] != -1);
 449         }
 450         newHash[hashPos] = e0;
 451       }
 452     }
 453
 454     hashMask = newMask;
 455     bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ords.length));
 456     ords = newHash;
 457     hashSize = newSize;
 458     hashHalfSize = newSize / 2;
 459   }
 460
 461   /**
 462    * reinitializes the {@link BytesRefHash} after a previous {@link #clear()}
 463    * call. If {@link #clear()} has not been called previously this method has no
 464    * effect.
 465    */
 466   public void reinit() {
 467     if (bytesStart == null) {
 468       bytesStart = bytesStartArray.init();
 469     }
 470
 471     if (ords == null) {
 472       ords = new int[hashSize];
 473       bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * hashSize);
 474     }
 475   }
 476
 477   /**
 478    * Returns the bytesStart offset into the internally used
 479    * {@link ByteBlockPool} for the given ord
 480    *
 481    * @param ord
 482    *          the ord to look up
 483    * @return the bytesStart offset into the internally used
 484    *         {@link ByteBlockPool} for the given ord
 485    */
 486   public int byteStart(int ord) {
 487     assert bytesStart != null : "Bytesstart is null - not initialized";
 488     assert ord >= 0 && ord < count : ord;
 489     return bytesStart[ord];
 490   }
 491
 492   /**
 493    * Thrown if a {@link BytesRef} exceeds the {@link BytesRefHash} limit of
 494    * {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2.
 495    */
 496   @SuppressWarnings("serial")
 497   public static class MaxBytesLengthExceededException extends RuntimeException {
 498     MaxBytesLengthExceededException(String message) {
 499       super(message);
 500     }
 501   }
 502
 503   public abstract static class BytesStartArray {
 504     /**
 505      * Initializes the BytesStartArray. This call will allocate memory
 506      *
 507      * @return the initialized bytes start array
 508      */
 509     public abstract int[] init();
 510
 511     /**
 512      * Grows the {@link BytesStartArray}
 513      *
 514      * @return the grown array
 515      */
 516     public abstract int[] grow();
 517
 518     /**
 519      * clears the {@link BytesStartArray} and returns the cleared instance.
 520      *
 521      * @return the cleared instance, this might be <code>null</code>
 522      */
 523     public abstract int[] clear();
 524
 525     /**
 526      * A {@link AtomicLong} reference holding the number of bytes used by this
 527      * {@link BytesStartArray}. The {@link BytesRefHash} uses this reference to
 528      * track it memory usage
 529      *
 530      * @return a {@link AtomicLong} reference holding the number of bytes used
 531      *         by this {@link BytesStartArray}.
 532      */
 533     public abstract AtomicLong bytesUsed();
 534   }
 535
 536   /**
 537    * A direct {@link BytesStartArray} that tracks all memory allocation using an {@link AtomicLong} instance.
 538    */
 539   public static class TrackingDirectBytesStartArray extends BytesStartArray {
 540     protected final int initSize;
 541     private int[] bytesStart;
 542     protected final AtomicLong bytesUsed;
 543
 544     public TrackingDirectBytesStartArray(int initSize, AtomicLong bytesUsed) {
 545       this.initSize = initSize;
 546       this.bytesUsed = bytesUsed;
 547     }
 548
 549     @Override
 550     public int[] clear() {
 551       if (bytesStart != null) {
 552         bytesUsed.addAndGet(-bytesStart.length * RamUsageEstimator.NUM_BYTES_INT);
 553       }
 554       return bytesStart = null;
 555     }
 556
 557     @Override
 558     public int[] grow() {
 559       assert bytesStart != null;
 560       final int oldSize = bytesStart.length;
 561       bytesStart = ArrayUtil.grow(bytesStart, bytesStart.length + 1);
 562       bytesUsed.addAndGet((bytesStart.length - oldSize) * RamUsageEstimator.NUM_BYTES_INT);
 563       return bytesStart;
 564     }
 565
 566     @Override
 567     public int[] init() {
 568       bytesStart = new int[ArrayUtil.oversize(initSize,
 569           RamUsageEstimator.NUM_BYTES_INT)];
 570       bytesUsed.addAndGet((bytesStart.length) * RamUsageEstimator.NUM_BYTES_INT);
 571       return bytesStart;
 572     }
 573
 574     @Override
 575     public AtomicLong bytesUsed() {
 576       return bytesUsed;
 577     }
 578   }
 579
 580   public static class DirectBytesStartArray extends BytesStartArray {
 581     protected final int initSize;
 582     private int[] bytesStart;
 583     private final AtomicLong bytesUsed;
 584
 585     public DirectBytesStartArray(int initSize) {
 586       this.bytesUsed = new AtomicLong(0);
 587       this.initSize = initSize;
 588     }
 589
 590
 591     @Override
 592     public int[] clear() {
 593       return bytesStart = null;
 594     }
 595
 596     @Override
 597     public int[] grow() {
 598       assert bytesStart != null;
 599       return bytesStart = ArrayUtil.grow(bytesStart, bytesStart.length + 1);
 600     }
 601
 602     @Override
 603     public int[] init() {
 604       return bytesStart = new int[ArrayUtil.oversize(initSize,
 605           RamUsageEstimator.NUM_BYTES_INT)];
 606     }
 607
 608     @Override
 609     public AtomicLong bytesUsed() {
 610       return bytesUsed;
 611     }
 612   }
 613 }