--- /dev/null
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK;
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SHIFT;
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
+
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.lucene.util.ByteBlockPool.DirectAllocator;
+
+/**
+ * {@link BytesRefHash} is a special purpose hash-map like data-structure
+ * optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of
+ * byte arrays to ordinal (Map<BytesRef,int>) storing the hashed bytes
+ * efficiently in continuous storage. The mapping to the ordinal is
+ * encapsulated inside {@link BytesRefHash} and is guaranteed to be increased
+ * for each added {@link BytesRef}.
+ *
+ * <p>
+ * Note: The maximum capacity {@link BytesRef} instance passed to
+ * {@link #add(BytesRef)} must not be longer than {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2.
+ * The internal storage is limited to 2GB total byte storage.
+ * </p>
+ *
+ * @lucene.internal
+ */
+public final class BytesRefHash {
+
+ public static final int DEFAULT_CAPACITY = 16;
+
+ // the following fields are needed by comparator,
+ // so package private to prevent access$-methods:
+ final ByteBlockPool pool;
+ int[] bytesStart;
+
+ private final BytesRef scratch1 = new BytesRef();
+ private int hashSize;
+ private int hashHalfSize;
+ private int hashMask;
+ private int count;
+ private int lastCount = -1;
+ private int[] ords;
+ private final BytesStartArray bytesStartArray;
+ private AtomicLong bytesUsed;
+
+ /**
+ * Creates a new {@link BytesRefHash} with a {@link ByteBlockPool} using a
+ * {@link DirectAllocator}.
+ */
+ public BytesRefHash() {
+ this(new ByteBlockPool(new DirectAllocator()));
+ }
+
+ /**
+ * Creates a new {@link BytesRefHash}
+ */
+ public BytesRefHash(ByteBlockPool pool) {
+ this(pool, DEFAULT_CAPACITY, new DirectBytesStartArray(DEFAULT_CAPACITY));
+ }
+
+ /**
+ * Creates a new {@link BytesRefHash}
+ */
+ public BytesRefHash(ByteBlockPool pool, int capacity,
+ BytesStartArray bytesStartArray) {
+ hashSize = capacity;
+ hashHalfSize = hashSize >> 1;
+ hashMask = hashSize - 1;
+ this.pool = pool;
+ ords = new int[hashSize];
+ Arrays.fill(ords, -1);
+ this.bytesStartArray = bytesStartArray;
+ bytesStart = bytesStartArray.init();
+ bytesUsed = bytesStartArray.bytesUsed() == null? new AtomicLong(0) : bytesStartArray.bytesUsed();;
+ bytesUsed.addAndGet(hashSize * RamUsageEstimator.NUM_BYTES_INT);
+ }
+
+ /**
+ * Returns the number of {@link BytesRef} values in this {@link BytesRefHash}.
+ *
+ * @return the number of {@link BytesRef} values in this {@link BytesRefHash}.
+ */
+ public int size() {
+ return count;
+ }
+
+ /**
+ * Populates and returns a {@link BytesRef} with the bytes for the given ord.
+ * <p>
+ * Note: the given ord must be a positive integer less that the current size (
+ * {@link #size()})
+ * </p>
+ *
+ * @param ord the ord
+ * @param ref the {@link BytesRef} to populate
+ *
+ * @return the given BytesRef instance populated with the bytes for the given ord
+ */
+ public BytesRef get(int ord, BytesRef ref) {
+ assert bytesStart != null : "bytesStart is null - not initialized";
+ assert ord < bytesStart.length: "ord exceeds byteStart len: " + bytesStart.length;
+ return pool.setBytesRef(ref, bytesStart[ord]);
+ }
+
+ /**
+ * Returns the ords array in arbitrary order. Valid ords start at offset of 0
+ * and end at a limit of {@link #size()} - 1
+ * <p>
+ * Note: This is a destructive operation. {@link #clear()} must be called in
+ * order to reuse this {@link BytesRefHash} instance.
+ * </p>
+ */
+ public int[] compact() {
+ assert bytesStart != null : "Bytesstart is null - not initialized";
+ int upto = 0;
+ for (int i = 0; i < hashSize; i++) {
+ if (ords[i] != -1) {
+ if (upto < i) {
+ ords[upto] = ords[i];
+ ords[i] = -1;
+ }
+ upto++;
+ }
+ }
+
+ assert upto == count;
+ lastCount = count;
+ return ords;
+ }
+
+ /**
+ * Returns the values array sorted by the referenced byte values.
+ * <p>
+ * Note: This is a destructive operation. {@link #clear()} must be called in
+ * order to reuse this {@link BytesRefHash} instance.
+ * </p>
+ *
+ * @param comp
+ * the {@link Comparator} used for sorting
+ */
+ public int[] sort(final Comparator<BytesRef> comp) {
+ final int[] compact = compact();
+ new SorterTemplate() {
+ @Override
+ protected void swap(int i, int j) {
+ final int o = compact[i];
+ compact[i] = compact[j];
+ compact[j] = o;
+ }
+
+ @Override
+ protected int compare(int i, int j) {
+ final int ord1 = compact[i], ord2 = compact[j];
+ assert bytesStart.length > ord1 && bytesStart.length > ord2;
+ return comp.compare(pool.setBytesRef(scratch1, bytesStart[ord1]),
+ pool.setBytesRef(scratch2, bytesStart[ord2]));
+ }
+
+ @Override
+ protected void setPivot(int i) {
+ final int ord = compact[i];
+ assert bytesStart.length > ord;
+ pool.setBytesRef(pivot, bytesStart[ord]);
+ }
+
+ @Override
+ protected int comparePivot(int j) {
+ final int ord = compact[j];
+ assert bytesStart.length > ord;
+ return comp.compare(pivot,
+ pool.setBytesRef(scratch2, bytesStart[ord]));
+ }
+
+ private final BytesRef pivot = new BytesRef(),
+ scratch1 = new BytesRef(), scratch2 = new BytesRef();
+ }.quickSort(0, count - 1);
+ return compact;
+ }
+
+ private boolean equals(int ord, BytesRef b) {
+ return pool.setBytesRef(scratch1, bytesStart[ord]).bytesEquals(b);
+ }
+
+ private boolean shrink(int targetSize) {
+ // Cannot use ArrayUtil.shrink because we require power
+ // of 2:
+ int newSize = hashSize;
+ while (newSize >= 8 && newSize / 4 > targetSize) {
+ newSize /= 2;
+ }
+ if (newSize != hashSize) {
+ bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT
+ * -(hashSize - newSize));
+ hashSize = newSize;
+ ords = new int[hashSize];
+ Arrays.fill(ords, -1);
+ hashHalfSize = newSize / 2;
+ hashMask = newSize - 1;
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * Clears the {@link BytesRef} which maps to the given {@link BytesRef}
+ */
+ public void clear(boolean resetPool) {
+ lastCount = count;
+ count = 0;
+ if (resetPool) {
+ pool.dropBuffersAndReset();
+ }
+ bytesStart = bytesStartArray.clear();
+ if (lastCount != -1 && shrink(lastCount)) {
+ // shrink clears the hash entries
+ return;
+ }
+ Arrays.fill(ords, -1);
+ }
+
+ public void clear() {
+ clear(true);
+ }
+
+ /**
+ * Closes the BytesRefHash and releases all internally used memory
+ */
+ public void close() {
+ clear(true);
+ ords = null;
+ bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT
+ * -hashSize);
+ }
+
+ /**
+ * Adds a new {@link BytesRef}
+ *
+ * @param bytes
+ * the bytes to hash
+ * @return the ord the given bytes are hashed if there was no mapping for the
+ * given bytes, otherwise <code>(-(ord)-1)</code>. This guarantees
+ * that the return value will always be >= 0 if the given bytes
+ * haven't been hashed before.
+ *
+ * @throws MaxBytesLengthExceededException
+ * if the given bytes are > 2 +
+ * {@link ByteBlockPool#BYTE_BLOCK_SIZE}
+ */
+ public int add(BytesRef bytes) {
+ return add(bytes, bytes.hashCode());
+ }
+
+ /**
+ * Adds a new {@link BytesRef} with a pre-calculated hash code.
+ *
+ * @param bytes
+ * the bytes to hash
+ * @param code
+ * the bytes hash code
+ *
+ * <p>
+ * Hashcode is defined as:
+ *
+ * <pre>
+ * int hash = 0;
+ * for (int i = offset; i < offset + length; i++) {
+ * hash = 31 * hash + bytes[i];
+ * }
+ * </pre>
+ *
+ * @return the ord the given bytes are hashed if there was no mapping for the
+ * given bytes, otherwise <code>(-(ord)-1)</code>. This guarantees
+ * that the return value will always be >= 0 if the given bytes
+ * haven't been hashed before.
+ *
+ * @throws MaxBytesLengthExceededException
+ * if the given bytes are >
+ * {@link ByteBlockPool#BYTE_BLOCK_SIZE} - 2
+ */
+ public int add(BytesRef bytes, int code) {
+ assert bytesStart != null : "Bytesstart is null - not initialized";
+ final int length = bytes.length;
+ // final position
+ int hashPos = code & hashMask;
+ int e = ords[hashPos];
+ if (e != -1 && !equals(e, bytes)) {
+ // Conflict: keep searching different locations in
+ // the hash table.
+ final int inc = ((code >> 8) + code) | 1;
+ do {
+ code += inc;
+ hashPos = code & hashMask;
+ e = ords[hashPos];
+ } while (e != -1 && !equals(e, bytes));
+ }
+
+ if (e == -1) {
+ // new entry
+ final int len2 = 2 + bytes.length;
+ if (len2 + pool.byteUpto > BYTE_BLOCK_SIZE) {
+ if (len2 > BYTE_BLOCK_SIZE) {
+ throw new MaxBytesLengthExceededException("bytes can be at most "
+ + (BYTE_BLOCK_SIZE - 2) + " in length; got " + bytes.length);
+ }
+ pool.nextBuffer();
+ }
+ final byte[] buffer = pool.buffer;
+ final int bufferUpto = pool.byteUpto;
+ if (count >= bytesStart.length) {
+ bytesStart = bytesStartArray.grow();
+ assert count < bytesStart.length + 1 : "count: " + count + " len: "
+ + bytesStart.length;
+ }
+ e = count++;
+
+ bytesStart[e] = bufferUpto + pool.byteOffset;
+
+ // We first encode the length, followed by the
+ // bytes. Length is encoded as vInt, but will consume
+ // 1 or 2 bytes at most (we reject too-long terms,
+ // above).
+ if (length < 128) {
+ // 1 byte to store length
+ buffer[bufferUpto] = (byte) length;
+ pool.byteUpto += length + 1;
+ assert length >= 0: "Length must be positive: " + length;
+ System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 1,
+ length);
+ } else {
+ // 2 byte to store length
+ buffer[bufferUpto] = (byte) (0x80 | (length & 0x7f));
+ buffer[bufferUpto + 1] = (byte) ((length >> 7) & 0xff);
+ pool.byteUpto += length + 2;
+ System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2,
+ length);
+ }
+ assert ords[hashPos] == -1;
+ ords[hashPos] = e;
+
+ if (count == hashHalfSize) {
+ rehash(2 * hashSize, true);
+ }
+ return e;
+ }
+ return -(e + 1);
+ }
+
+ public int addByPoolOffset(int offset) {
+ assert bytesStart != null : "Bytesstart is null - not initialized";
+ // final position
+ int code = offset;
+ int hashPos = offset & hashMask;
+ int e = ords[hashPos];
+ if (e != -1 && bytesStart[e] != offset) {
+ // Conflict: keep searching different locations in
+ // the hash table.
+ final int inc = ((code >> 8) + code) | 1;
+ do {
+ code += inc;
+ hashPos = code & hashMask;
+ e = ords[hashPos];
+ } while (e != -1 && bytesStart[e] != offset);
+ }
+ if (e == -1) {
+ // new entry
+ if (count >= bytesStart.length) {
+ bytesStart = bytesStartArray.grow();
+ assert count < bytesStart.length + 1 : "count: " + count + " len: "
+ + bytesStart.length;
+ }
+ e = count++;
+ bytesStart[e] = offset;
+ assert ords[hashPos] == -1;
+ ords[hashPos] = e;
+
+ if (count == hashHalfSize) {
+ rehash(2 * hashSize, false);
+ }
+ return e;
+ }
+ return -(e + 1);
+ }
+
+ /**
+ * Called when hash is too small (> 50% occupied) or too large (< 20%
+ * occupied).
+ */
+ private void rehash(final int newSize, boolean hashOnData) {
+ final int newMask = newSize - 1;
+ bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (newSize));
+ final int[] newHash = new int[newSize];
+ Arrays.fill(newHash, -1);
+ for (int i = 0; i < hashSize; i++) {
+ final int e0 = ords[i];
+ if (e0 != -1) {
+ int code;
+ if (hashOnData) {
+ final int off = bytesStart[e0];
+ final int start = off & BYTE_BLOCK_MASK;
+ final byte[] bytes = pool.buffers[off >> BYTE_BLOCK_SHIFT];
+ code = 0;
+ final int len;
+ int pos;
+ if ((bytes[start] & 0x80) == 0) {
+ // length is 1 byte
+ len = bytes[start];
+ pos = start + 1;
+ } else {
+ len = (bytes[start] & 0x7f) + ((bytes[start + 1] & 0xff) << 7);
+ pos = start + 2;
+ }
+
+ final int endPos = pos + len;
+ while (pos < endPos) {
+ code = BytesRef.HASH_PRIME * code + bytes[pos++];
+ }
+ } else {
+ code = bytesStart[e0];
+ }
+
+ int hashPos = code & newMask;
+ assert hashPos >= 0;
+ if (newHash[hashPos] != -1) {
+ final int inc = ((code >> 8) + code) | 1;
+ do {
+ code += inc;
+ hashPos = code & newMask;
+ } while (newHash[hashPos] != -1);
+ }
+ newHash[hashPos] = e0;
+ }
+ }
+
+ hashMask = newMask;
+ bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ords.length));
+ ords = newHash;
+ hashSize = newSize;
+ hashHalfSize = newSize / 2;
+ }
+
+ /**
+ * reinitializes the {@link BytesRefHash} after a previous {@link #clear()}
+ * call. If {@link #clear()} has not been called previously this method has no
+ * effect.
+ */
+ public void reinit() {
+ if (bytesStart == null) {
+ bytesStart = bytesStartArray.init();
+ }
+
+ if (ords == null) {
+ ords = new int[hashSize];
+ bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * hashSize);
+ }
+ }
+
+ /**
+ * Returns the bytesStart offset into the internally used
+ * {@link ByteBlockPool} for the given ord
+ *
+ * @param ord
+ * the ord to look up
+ * @return the bytesStart offset into the internally used
+ * {@link ByteBlockPool} for the given ord
+ */
+ public int byteStart(int ord) {
+ assert bytesStart != null : "Bytesstart is null - not initialized";
+ assert ord >= 0 && ord < count : ord;
+ return bytesStart[ord];
+ }
+
+ /**
+ * Thrown if a {@link BytesRef} exceeds the {@link BytesRefHash} limit of
+ * {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2.
+ */
+ @SuppressWarnings("serial")
+ public static class MaxBytesLengthExceededException extends RuntimeException {
+ MaxBytesLengthExceededException(String message) {
+ super(message);
+ }
+ }
+
+ public abstract static class BytesStartArray {
+ /**
+ * Initializes the BytesStartArray. This call will allocate memory
+ *
+ * @return the initialized bytes start array
+ */
+ public abstract int[] init();
+
+ /**
+ * Grows the {@link BytesStartArray}
+ *
+ * @return the grown array
+ */
+ public abstract int[] grow();
+
+ /**
+ * clears the {@link BytesStartArray} and returns the cleared instance.
+ *
+ * @return the cleared instance, this might be <code>null</code>
+ */
+ public abstract int[] clear();
+
+ /**
+ * A {@link AtomicLong} reference holding the number of bytes used by this
+ * {@link BytesStartArray}. The {@link BytesRefHash} uses this reference to
+ * track it memory usage
+ *
+ * @return a {@link AtomicLong} reference holding the number of bytes used
+ * by this {@link BytesStartArray}.
+ */
+ public abstract AtomicLong bytesUsed();
+ }
+
+ /**
+ * A direct {@link BytesStartArray} that tracks all memory allocation using an {@link AtomicLong} instance.
+ */
+ public static class TrackingDirectBytesStartArray extends BytesStartArray {
+ protected final int initSize;
+ private int[] bytesStart;
+ protected final AtomicLong bytesUsed;
+
+ public TrackingDirectBytesStartArray(int initSize, AtomicLong bytesUsed) {
+ this.initSize = initSize;
+ this.bytesUsed = bytesUsed;
+ }
+
+ @Override
+ public int[] clear() {
+ if (bytesStart != null) {
+ bytesUsed.addAndGet(-bytesStart.length * RamUsageEstimator.NUM_BYTES_INT);
+ }
+ return bytesStart = null;
+ }
+
+ @Override
+ public int[] grow() {
+ assert bytesStart != null;
+ final int oldSize = bytesStart.length;
+ bytesStart = ArrayUtil.grow(bytesStart, bytesStart.length + 1);
+ bytesUsed.addAndGet((bytesStart.length - oldSize) * RamUsageEstimator.NUM_BYTES_INT);
+ return bytesStart;
+ }
+
+ @Override
+ public int[] init() {
+ bytesStart = new int[ArrayUtil.oversize(initSize,
+ RamUsageEstimator.NUM_BYTES_INT)];
+ bytesUsed.addAndGet((bytesStart.length) * RamUsageEstimator.NUM_BYTES_INT);
+ return bytesStart;
+ }
+
+ @Override
+ public AtomicLong bytesUsed() {
+ return bytesUsed;
+ }
+ }
+
+ public static class DirectBytesStartArray extends BytesStartArray {
+ protected final int initSize;
+ private int[] bytesStart;
+ private final AtomicLong bytesUsed;
+
+ public DirectBytesStartArray(int initSize) {
+ this.bytesUsed = new AtomicLong(0);
+ this.initSize = initSize;
+ }
+
+
+ @Override
+ public int[] clear() {
+ return bytesStart = null;
+ }
+
+ @Override
+ public int[] grow() {
+ assert bytesStart != null;
+ return bytesStart = ArrayUtil.grow(bytesStart, bytesStart.length + 1);
+ }
+
+ @Override
+ public int[] init() {
+ return bytesStart = new int[ArrayUtil.oversize(initSize,
+ RamUsageEstimator.NUM_BYTES_INT)];
+ }
+
+ @Override
+ public AtomicLong bytesUsed() {
+ return bytesUsed;
+ }
+ }
+}