X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/util/BytesRefHash.java diff --git a/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/util/BytesRefHash.java b/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/util/BytesRefHash.java deleted file mode 100644 index 2fdb32e..0000000 --- a/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/util/BytesRefHash.java +++ /dev/null @@ -1,613 +0,0 @@ -package org.apache.lucene.util; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK; -import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SHIFT; -import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE; - -import java.util.Arrays; -import java.util.Comparator; -import java.util.concurrent.atomic.AtomicLong; - -import org.apache.lucene.util.ByteBlockPool.DirectAllocator; - -/** - * {@link BytesRefHash} is a special purpose hash-map like data-structure - * optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of - * byte arrays to ordinal (Map) storing the hashed bytes - * efficiently in continuous storage. The mapping to the ordinal is - * encapsulated inside {@link BytesRefHash} and is guaranteed to be increased - * for each added {@link BytesRef}. - * - *

- * Note: The maximum capacity {@link BytesRef} instance passed to - * {@link #add(BytesRef)} must not be longer than {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2. - * The internal storage is limited to 2GB total byte storage. - *

- * - * @lucene.internal - */ -public final class BytesRefHash { - - public static final int DEFAULT_CAPACITY = 16; - - // the following fields are needed by comparator, - // so package private to prevent access$-methods: - final ByteBlockPool pool; - int[] bytesStart; - - private final BytesRef scratch1 = new BytesRef(); - private int hashSize; - private int hashHalfSize; - private int hashMask; - private int count; - private int lastCount = -1; - private int[] ords; - private final BytesStartArray bytesStartArray; - private AtomicLong bytesUsed; - - /** - * Creates a new {@link BytesRefHash} with a {@link ByteBlockPool} using a - * {@link DirectAllocator}. - */ - public BytesRefHash() { - this(new ByteBlockPool(new DirectAllocator())); - } - - /** - * Creates a new {@link BytesRefHash} - */ - public BytesRefHash(ByteBlockPool pool) { - this(pool, DEFAULT_CAPACITY, new DirectBytesStartArray(DEFAULT_CAPACITY)); - } - - /** - * Creates a new {@link BytesRefHash} - */ - public BytesRefHash(ByteBlockPool pool, int capacity, - BytesStartArray bytesStartArray) { - hashSize = capacity; - hashHalfSize = hashSize >> 1; - hashMask = hashSize - 1; - this.pool = pool; - ords = new int[hashSize]; - Arrays.fill(ords, -1); - this.bytesStartArray = bytesStartArray; - bytesStart = bytesStartArray.init(); - bytesUsed = bytesStartArray.bytesUsed() == null? new AtomicLong(0) : bytesStartArray.bytesUsed();; - bytesUsed.addAndGet(hashSize * RamUsageEstimator.NUM_BYTES_INT); - } - - /** - * Returns the number of {@link BytesRef} values in this {@link BytesRefHash}. - * - * @return the number of {@link BytesRef} values in this {@link BytesRefHash}. - */ - public int size() { - return count; - } - - /** - * Populates and returns a {@link BytesRef} with the bytes for the given ord. - *

- * Note: the given ord must be a positive integer less that the current size ( - * {@link #size()}) - *

- * - * @param ord the ord - * @param ref the {@link BytesRef} to populate - * - * @return the given BytesRef instance populated with the bytes for the given ord - */ - public BytesRef get(int ord, BytesRef ref) { - assert bytesStart != null : "bytesStart is null - not initialized"; - assert ord < bytesStart.length: "ord exceeds byteStart len: " + bytesStart.length; - return pool.setBytesRef(ref, bytesStart[ord]); - } - - /** - * Returns the ords array in arbitrary order. Valid ords start at offset of 0 - * and end at a limit of {@link #size()} - 1 - *

- * Note: This is a destructive operation. {@link #clear()} must be called in - * order to reuse this {@link BytesRefHash} instance. - *

- */ - public int[] compact() { - assert bytesStart != null : "Bytesstart is null - not initialized"; - int upto = 0; - for (int i = 0; i < hashSize; i++) { - if (ords[i] != -1) { - if (upto < i) { - ords[upto] = ords[i]; - ords[i] = -1; - } - upto++; - } - } - - assert upto == count; - lastCount = count; - return ords; - } - - /** - * Returns the values array sorted by the referenced byte values. - *

- * Note: This is a destructive operation. {@link #clear()} must be called in - * order to reuse this {@link BytesRefHash} instance. - *

- * - * @param comp - * the {@link Comparator} used for sorting - */ - public int[] sort(final Comparator comp) { - final int[] compact = compact(); - new SorterTemplate() { - @Override - protected void swap(int i, int j) { - final int o = compact[i]; - compact[i] = compact[j]; - compact[j] = o; - } - - @Override - protected int compare(int i, int j) { - final int ord1 = compact[i], ord2 = compact[j]; - assert bytesStart.length > ord1 && bytesStart.length > ord2; - return comp.compare(pool.setBytesRef(scratch1, bytesStart[ord1]), - pool.setBytesRef(scratch2, bytesStart[ord2])); - } - - @Override - protected void setPivot(int i) { - final int ord = compact[i]; - assert bytesStart.length > ord; - pool.setBytesRef(pivot, bytesStart[ord]); - } - - @Override - protected int comparePivot(int j) { - final int ord = compact[j]; - assert bytesStart.length > ord; - return comp.compare(pivot, - pool.setBytesRef(scratch2, bytesStart[ord])); - } - - private final BytesRef pivot = new BytesRef(), - scratch1 = new BytesRef(), scratch2 = new BytesRef(); - }.quickSort(0, count - 1); - return compact; - } - - private boolean equals(int ord, BytesRef b) { - return pool.setBytesRef(scratch1, bytesStart[ord]).bytesEquals(b); - } - - private boolean shrink(int targetSize) { - // Cannot use ArrayUtil.shrink because we require power - // of 2: - int newSize = hashSize; - while (newSize >= 8 && newSize / 4 > targetSize) { - newSize /= 2; - } - if (newSize != hashSize) { - bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT - * -(hashSize - newSize)); - hashSize = newSize; - ords = new int[hashSize]; - Arrays.fill(ords, -1); - hashHalfSize = newSize / 2; - hashMask = newSize - 1; - return true; - } else { - return false; - } - } - - /** - * Clears the {@link BytesRef} which maps to the given {@link BytesRef} - */ - public void clear(boolean resetPool) { - lastCount = count; - count = 0; - if (resetPool) { - pool.dropBuffersAndReset(); - } - bytesStart = bytesStartArray.clear(); - if (lastCount != -1 && shrink(lastCount)) { - // shrink clears the hash entries - return; - } - Arrays.fill(ords, -1); - } - - public void clear() { - clear(true); - } - - /** - * Closes the BytesRefHash and releases all internally used memory - */ - public void close() { - clear(true); - ords = null; - bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT - * -hashSize); - } - - /** - * Adds a new {@link BytesRef} - * - * @param bytes - * the bytes to hash - * @return the ord the given bytes are hashed if there was no mapping for the - * given bytes, otherwise (-(ord)-1). This guarantees - * that the return value will always be >= 0 if the given bytes - * haven't been hashed before. - * - * @throws MaxBytesLengthExceededException - * if the given bytes are > 2 + - * {@link ByteBlockPool#BYTE_BLOCK_SIZE} - */ - public int add(BytesRef bytes) { - return add(bytes, bytes.hashCode()); - } - - /** - * Adds a new {@link BytesRef} with a pre-calculated hash code. - * - * @param bytes - * the bytes to hash - * @param code - * the bytes hash code - * - *

- * Hashcode is defined as: - * - *

-   * int hash = 0;
-   * for (int i = offset; i < offset + length; i++) {
-   *   hash = 31 * hash + bytes[i];
-   * }
-   * 
- * - * @return the ord the given bytes are hashed if there was no mapping for the - * given bytes, otherwise (-(ord)-1). This guarantees - * that the return value will always be >= 0 if the given bytes - * haven't been hashed before. - * - * @throws MaxBytesLengthExceededException - * if the given bytes are > - * {@link ByteBlockPool#BYTE_BLOCK_SIZE} - 2 - */ - public int add(BytesRef bytes, int code) { - assert bytesStart != null : "Bytesstart is null - not initialized"; - final int length = bytes.length; - // final position - int hashPos = code & hashMask; - int e = ords[hashPos]; - if (e != -1 && !equals(e, bytes)) { - // Conflict: keep searching different locations in - // the hash table. - final int inc = ((code >> 8) + code) | 1; - do { - code += inc; - hashPos = code & hashMask; - e = ords[hashPos]; - } while (e != -1 && !equals(e, bytes)); - } - - if (e == -1) { - // new entry - final int len2 = 2 + bytes.length; - if (len2 + pool.byteUpto > BYTE_BLOCK_SIZE) { - if (len2 > BYTE_BLOCK_SIZE) { - throw new MaxBytesLengthExceededException("bytes can be at most " - + (BYTE_BLOCK_SIZE - 2) + " in length; got " + bytes.length); - } - pool.nextBuffer(); - } - final byte[] buffer = pool.buffer; - final int bufferUpto = pool.byteUpto; - if (count >= bytesStart.length) { - bytesStart = bytesStartArray.grow(); - assert count < bytesStart.length + 1 : "count: " + count + " len: " - + bytesStart.length; - } - e = count++; - - bytesStart[e] = bufferUpto + pool.byteOffset; - - // We first encode the length, followed by the - // bytes. Length is encoded as vInt, but will consume - // 1 or 2 bytes at most (we reject too-long terms, - // above). - if (length < 128) { - // 1 byte to store length - buffer[bufferUpto] = (byte) length; - pool.byteUpto += length + 1; - assert length >= 0: "Length must be positive: " + length; - System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 1, - length); - } else { - // 2 byte to store length - buffer[bufferUpto] = (byte) (0x80 | (length & 0x7f)); - buffer[bufferUpto + 1] = (byte) ((length >> 7) & 0xff); - pool.byteUpto += length + 2; - System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2, - length); - } - assert ords[hashPos] == -1; - ords[hashPos] = e; - - if (count == hashHalfSize) { - rehash(2 * hashSize, true); - } - return e; - } - return -(e + 1); - } - - public int addByPoolOffset(int offset) { - assert bytesStart != null : "Bytesstart is null - not initialized"; - // final position - int code = offset; - int hashPos = offset & hashMask; - int e = ords[hashPos]; - if (e != -1 && bytesStart[e] != offset) { - // Conflict: keep searching different locations in - // the hash table. - final int inc = ((code >> 8) + code) | 1; - do { - code += inc; - hashPos = code & hashMask; - e = ords[hashPos]; - } while (e != -1 && bytesStart[e] != offset); - } - if (e == -1) { - // new entry - if (count >= bytesStart.length) { - bytesStart = bytesStartArray.grow(); - assert count < bytesStart.length + 1 : "count: " + count + " len: " - + bytesStart.length; - } - e = count++; - bytesStart[e] = offset; - assert ords[hashPos] == -1; - ords[hashPos] = e; - - if (count == hashHalfSize) { - rehash(2 * hashSize, false); - } - return e; - } - return -(e + 1); - } - - /** - * Called when hash is too small (> 50% occupied) or too large (< 20% - * occupied). - */ - private void rehash(final int newSize, boolean hashOnData) { - final int newMask = newSize - 1; - bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (newSize)); - final int[] newHash = new int[newSize]; - Arrays.fill(newHash, -1); - for (int i = 0; i < hashSize; i++) { - final int e0 = ords[i]; - if (e0 != -1) { - int code; - if (hashOnData) { - final int off = bytesStart[e0]; - final int start = off & BYTE_BLOCK_MASK; - final byte[] bytes = pool.buffers[off >> BYTE_BLOCK_SHIFT]; - code = 0; - final int len; - int pos; - if ((bytes[start] & 0x80) == 0) { - // length is 1 byte - len = bytes[start]; - pos = start + 1; - } else { - len = (bytes[start] & 0x7f) + ((bytes[start + 1] & 0xff) << 7); - pos = start + 2; - } - - final int endPos = pos + len; - while (pos < endPos) { - code = BytesRef.HASH_PRIME * code + bytes[pos++]; - } - } else { - code = bytesStart[e0]; - } - - int hashPos = code & newMask; - assert hashPos >= 0; - if (newHash[hashPos] != -1) { - final int inc = ((code >> 8) + code) | 1; - do { - code += inc; - hashPos = code & newMask; - } while (newHash[hashPos] != -1); - } - newHash[hashPos] = e0; - } - } - - hashMask = newMask; - bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ords.length)); - ords = newHash; - hashSize = newSize; - hashHalfSize = newSize / 2; - } - - /** - * reinitializes the {@link BytesRefHash} after a previous {@link #clear()} - * call. If {@link #clear()} has not been called previously this method has no - * effect. - */ - public void reinit() { - if (bytesStart == null) { - bytesStart = bytesStartArray.init(); - } - - if (ords == null) { - ords = new int[hashSize]; - bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * hashSize); - } - } - - /** - * Returns the bytesStart offset into the internally used - * {@link ByteBlockPool} for the given ord - * - * @param ord - * the ord to look up - * @return the bytesStart offset into the internally used - * {@link ByteBlockPool} for the given ord - */ - public int byteStart(int ord) { - assert bytesStart != null : "Bytesstart is null - not initialized"; - assert ord >= 0 && ord < count : ord; - return bytesStart[ord]; - } - - /** - * Thrown if a {@link BytesRef} exceeds the {@link BytesRefHash} limit of - * {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2. - */ - @SuppressWarnings("serial") - public static class MaxBytesLengthExceededException extends RuntimeException { - MaxBytesLengthExceededException(String message) { - super(message); - } - } - - public abstract static class BytesStartArray { - /** - * Initializes the BytesStartArray. This call will allocate memory - * - * @return the initialized bytes start array - */ - public abstract int[] init(); - - /** - * Grows the {@link BytesStartArray} - * - * @return the grown array - */ - public abstract int[] grow(); - - /** - * clears the {@link BytesStartArray} and returns the cleared instance. - * - * @return the cleared instance, this might be null - */ - public abstract int[] clear(); - - /** - * A {@link AtomicLong} reference holding the number of bytes used by this - * {@link BytesStartArray}. The {@link BytesRefHash} uses this reference to - * track it memory usage - * - * @return a {@link AtomicLong} reference holding the number of bytes used - * by this {@link BytesStartArray}. - */ - public abstract AtomicLong bytesUsed(); - } - - /** - * A direct {@link BytesStartArray} that tracks all memory allocation using an {@link AtomicLong} instance. - */ - public static class TrackingDirectBytesStartArray extends BytesStartArray { - protected final int initSize; - private int[] bytesStart; - protected final AtomicLong bytesUsed; - - public TrackingDirectBytesStartArray(int initSize, AtomicLong bytesUsed) { - this.initSize = initSize; - this.bytesUsed = bytesUsed; - } - - @Override - public int[] clear() { - if (bytesStart != null) { - bytesUsed.addAndGet(-bytesStart.length * RamUsageEstimator.NUM_BYTES_INT); - } - return bytesStart = null; - } - - @Override - public int[] grow() { - assert bytesStart != null; - final int oldSize = bytesStart.length; - bytesStart = ArrayUtil.grow(bytesStart, bytesStart.length + 1); - bytesUsed.addAndGet((bytesStart.length - oldSize) * RamUsageEstimator.NUM_BYTES_INT); - return bytesStart; - } - - @Override - public int[] init() { - bytesStart = new int[ArrayUtil.oversize(initSize, - RamUsageEstimator.NUM_BYTES_INT)]; - bytesUsed.addAndGet((bytesStart.length) * RamUsageEstimator.NUM_BYTES_INT); - return bytesStart; - } - - @Override - public AtomicLong bytesUsed() { - return bytesUsed; - } - } - - public static class DirectBytesStartArray extends BytesStartArray { - protected final int initSize; - private int[] bytesStart; - private final AtomicLong bytesUsed; - - public DirectBytesStartArray(int initSize) { - this.bytesUsed = new AtomicLong(0); - this.initSize = initSize; - } - - - @Override - public int[] clear() { - return bytesStart = null; - } - - @Override - public int[] grow() { - assert bytesStart != null; - return bytesStart = ArrayUtil.grow(bytesStart, bytesStart.length + 1); - } - - @Override - public int[] init() { - return bytesStart = new int[ArrayUtil.oversize(initSize, - RamUsageEstimator.NUM_BYTES_INT)]; - } - - @Override - public AtomicLong bytesUsed() { - return bytesUsed; - } - } -}