+++ /dev/null
-package org.apache.lucene.facet.taxonomy.writercache.cl2o;
-
-import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.apache.lucene.facet.taxonomy.CategoryPath;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * This is a very efficient LabelToOrdinal implementation that uses a
- * CharBlockArray to store all labels and a configurable number of HashArrays to
- * reference the labels.
- * <p>
- * Since the HashArrays don't handle collisions, a {@link CollisionMap} is used
- * to store the colliding labels.
- * <p>
- * This data structure grows by adding a new HashArray whenever the number of
- * collisions in the {@link CollisionMap} exceeds {@code loadFactor} *
- * {@link #getMaxOrdinal()}. Growing also includes reinserting all colliding
- * labels into the HashArrays to possibly reduce the number of collisions.
- *
- * For setting the {@code loadFactor} see
- * {@link #CompactLabelToOrdinal(int, float, int)}.
- *
- * <p>
- * This data structure has a much lower memory footprint (~30%) compared to a
- * Java HashMap<String, Integer>. It also only uses a small fraction of objects
- * a HashMap would use, thus limiting the GC overhead. Ingestion speed was also
- * ~50% faster compared to a HashMap for 3M unique labels.
- *
- * @lucene.experimental
- */
-public class CompactLabelToOrdinal extends LabelToOrdinal {
-
- public static final float DefaultLoadFactor = 0.15f;
-
- static final char TerminatorChar = 0xffff;
- private static final int Collision = -5;
-
- private HashArray[] hashArrays;
- private CollisionMap collisionMap;
- private CharBlockArray labelRepository;
-
- private int capacity;
- private int threshold;
- private float loadFactor;
-
- public int sizeOfMap() {
- return this.collisionMap.size();
- }
-
- private CompactLabelToOrdinal() {
- }
-
- public CompactLabelToOrdinal(int initialCapacity, float loadFactor,
- int numHashArrays) {
-
- this.hashArrays = new HashArray[numHashArrays];
-
- this.capacity = determineCapacity((int) Math.pow(2, numHashArrays),
- initialCapacity);
- init();
- this.collisionMap = new CollisionMap(this.labelRepository);
-
- this.counter = 0;
- this.loadFactor = loadFactor;
-
- this.threshold = (int) (this.loadFactor * this.capacity);
- }
-
- static int determineCapacity(int minCapacity, int initialCapacity) {
- int capacity = minCapacity;
- while (capacity < initialCapacity) {
- capacity <<= 1;
- }
- return capacity;
- }
-
- private void init() {
- labelRepository = new CharBlockArray();
- try {
- new CategoryPath().serializeAppendTo(labelRepository);
- } catch (IOException e) { } //can't happen
-
- int c = this.capacity;
- for (int i = 0; i < this.hashArrays.length; i++) {
- this.hashArrays[i] = new HashArray(c);
- c /= 2;
- }
- }
-
- @Override
- public void addLabel(CategoryPath label, int ordinal) {
- if (this.collisionMap.size() > this.threshold) {
- grow();
- }
-
- int hash = CompactLabelToOrdinal.stringHashCode(label);
- for (int i = 0; i < this.hashArrays.length; i++) {
- if (addLabel(this.hashArrays[i], label, hash, ordinal)) {
- return;
- }
- }
-
- int prevVal = this.collisionMap.addLabel(label, hash, ordinal);
- if (prevVal != ordinal) {
- throw new IllegalArgumentException("Label already exists: " +
- label.toString('/') + " prev ordinal " + prevVal);
- }
- }
-
- @Override
- public void addLabel(CategoryPath label, int prefixLen, int ordinal) {
- if (this.collisionMap.size() > this.threshold) {
- grow();
- }
-
- int hash = CompactLabelToOrdinal.stringHashCode(label, prefixLen);
- for (int i = 0; i < this.hashArrays.length; i++) {
- if (addLabel(this.hashArrays[i], label, prefixLen, hash, ordinal)) {
- return;
- }
- }
-
- int prevVal = this.collisionMap.addLabel(label, prefixLen, hash, ordinal);
- if (prevVal != ordinal) {
- throw new IllegalArgumentException("Label already exists: " +
- label.toString('/', prefixLen) + " prev ordinal " + prevVal);
- }
- }
-
- @Override
- public int getOrdinal(CategoryPath label) {
- if (label == null) {
- return LabelToOrdinal.InvalidOrdinal;
- }
-
- int hash = CompactLabelToOrdinal.stringHashCode(label);
- for (int i = 0; i < this.hashArrays.length; i++) {
- int ord = getOrdinal(this.hashArrays[i], label, hash);
- if (ord != Collision) {
- return ord;
- }
- }
-
- return this.collisionMap.get(label, hash);
- }
-
- @Override
- public int getOrdinal(CategoryPath label, int prefixLen) {
- if (label == null) {
- return LabelToOrdinal.InvalidOrdinal;
- }
-
- int hash = CompactLabelToOrdinal.stringHashCode(label, prefixLen);
- for (int i = 0; i < this.hashArrays.length; i++) {
- int ord = getOrdinal(this.hashArrays[i], label, prefixLen, hash);
- if (ord != Collision) {
- return ord;
- }
- }
-
- return this.collisionMap.get(label, prefixLen, hash);
- }
-
- private void grow() {
- HashArray temp = this.hashArrays[this.hashArrays.length - 1];
-
- for (int i = this.hashArrays.length - 1; i > 0; i--) {
- this.hashArrays[i] = this.hashArrays[i - 1];
- }
-
- this.capacity *= 2;
- this.hashArrays[0] = new HashArray(this.capacity);
-
- for (int i = 1; i < this.hashArrays.length; i++) {
- int[] sourceOffsetArray = this.hashArrays[i].offsets;
- int[] sourceCidsArray = this.hashArrays[i].cids;
-
- for (int k = 0; k < sourceOffsetArray.length; k++) {
-
- for (int j = 0; j < i && sourceOffsetArray[k] != 0; j++) {
- int[] targetOffsetArray = this.hashArrays[j].offsets;
- int[] targetCidsArray = this.hashArrays[j].cids;
-
- int newIndex = indexFor(stringHashCode(
- this.labelRepository, sourceOffsetArray[k]),
- targetOffsetArray.length);
- if (targetOffsetArray[newIndex] == 0) {
- targetOffsetArray[newIndex] = sourceOffsetArray[k];
- targetCidsArray[newIndex] = sourceCidsArray[k];
- sourceOffsetArray[k] = 0;
- }
- }
- }
- }
-
- for (int i = 0; i < temp.offsets.length; i++) {
- int offset = temp.offsets[i];
- if (offset > 0) {
- int hash = stringHashCode(this.labelRepository, offset);
- addLabelOffset(hash, temp.cids[i], offset);
- }
- }
-
- CollisionMap oldCollisionMap = this.collisionMap;
- this.collisionMap = new CollisionMap(oldCollisionMap.capacity(),
- this.labelRepository);
- this.threshold = (int) (this.capacity * this.loadFactor);
-
- Iterator<CollisionMap.Entry> it = oldCollisionMap.entryIterator();
- while (it.hasNext()) {
- CollisionMap.Entry e = it.next();
- addLabelOffset(stringHashCode(this.labelRepository, e.offset),
- e.cid, e.offset);
- }
- }
-
- private boolean addLabel(HashArray a, CategoryPath label, int hash,
- int ordinal) {
- int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
- int offset = a.offsets[index];
-
- if (offset == 0) {
- a.offsets[index] = this.labelRepository.length();
- try {
- label.serializeAppendTo(this.labelRepository);
- } catch (IOException e) {
- // can't happen - LabelRepository.append() never throws an
- // exception
- }
- a.cids[index] = ordinal;
- return true;
- }
-
- return false;
- }
-
- private boolean addLabel(HashArray a, CategoryPath label, int prefixLen,
- int hash, int ordinal) {
- int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
- int offset = a.offsets[index];
-
- if (offset == 0) {
- a.offsets[index] = this.labelRepository.length();
- try {
- label.serializeAppendTo(prefixLen, this.labelRepository);
- } catch (IOException e) {
- // can't happen - LabelRepository.append() never throws an
- // exception
- }
- a.cids[index] = ordinal;
- return true;
- }
-
- return false;
- }
-
- private void addLabelOffset(int hash, int cid, int knownOffset) {
- for (int i = 0; i < this.hashArrays.length; i++) {
- if (addLabelOffsetToHashArray(this.hashArrays[i], hash, cid,
- knownOffset)) {
- return;
- }
- }
-
- this.collisionMap.addLabelOffset(hash, knownOffset, cid);
-
- if (this.collisionMap.size() > this.threshold) {
- grow();
- }
- }
-
- private boolean addLabelOffsetToHashArray(HashArray a, int hash, int ordinal,
- int knownOffset) {
-
- int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
- int offset = a.offsets[index];
-
- if (offset == 0) {
- a.offsets[index] = knownOffset;
- a.cids[index] = ordinal;
- return true;
- }
-
- return false;
- }
-
- private int getOrdinal(HashArray a, CategoryPath label, int hash) {
- if (label == null) {
- return LabelToOrdinal.InvalidOrdinal;
- }
-
- int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
- int offset = a.offsets[index];
- if (offset == 0) {
- return LabelToOrdinal.InvalidOrdinal;
- }
-
- if (label.equalsToSerialized(labelRepository, offset)) {
- return a.cids[index];
- }
-
- return Collision;
- }
-
- private int getOrdinal(HashArray a, CategoryPath label, int prefixLen, int hash) {
- if (label == null) {
- return LabelToOrdinal.InvalidOrdinal;
- }
-
- int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
- int offset = a.offsets[index];
- if (offset == 0) {
- return LabelToOrdinal.InvalidOrdinal;
- }
-
- if (label.equalsToSerialized(prefixLen, labelRepository, offset)) {
- return a.cids[index];
- }
-
- return Collision;
- }
-
- /**
- * Returns index for hash code h.
- */
- static int indexFor(int h, int length) {
- return h & (length - 1);
- }
-
- // static int stringHashCode(String label) {
- // int len = label.length();
- // int hash = 0;
- // int i;
- // for (i = 0; i < len; ++i)
- // hash = 33 * hash + label.charAt(i);
- //
- // hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
- // hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
- //
- // return hash;
- //
- // }
-
- static int stringHashCode(CategoryPath label) {
- int hash = label.hashCode();
-
- hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
- hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
-
- return hash;
-
- }
-
- static int stringHashCode(CategoryPath label, int prefixLen) {
- int hash = label.hashCode(prefixLen);
-
- hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
- hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
-
- return hash;
-
- }
-
- static int stringHashCode(CharBlockArray labelRepository, int offset) {
- int hash = CategoryPath.hashCodeOfSerialized(labelRepository, offset);
-
- hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
- hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
-
- return hash;
- }
-
- // public static boolean equals(CharSequence label, CharBlockArray array,
- // int offset) {
- // // CONTINUE HERE
- // int len = label.length();
- // int bi = array.blockIndex(offset);
- // CharBlockArray.Block b = array.blocks.get(bi);
- // int index = array.indexInBlock(offset);
- //
- // for (int i = 0; i < len; i++) {
- // if (label.charAt(i) != b.chars[index]) {
- // return false;
- // }
- // index++;
- // if (index == b.length) {
- // b = array.blocks.get(++bi);
- // index = 0;
- // }
- // }
- //
- // return b.chars[index] == TerminatorChar;
- // }
-
- /**
- * Returns an estimate of the amount of memory used by this table. Called only in
- * this package. Memory is consumed mainly by three structures: the hash arrays,
- * label repository and collision map.
- */
- int getMemoryUsage() {
- int memoryUsage = 0;
- if (this.hashArrays != null) {
- // HashArray capacity is instance-specific.
- for (HashArray ha : this.hashArrays) {
- // Each has 2 capacity-length arrays of ints.
- memoryUsage += ( ha.capacity * 2 * 4 ) + 4;
- }
- }
- if (this.labelRepository != null) {
- // All blocks are the same size.
- int blockSize = this.labelRepository.blockSize;
- // Each block has room for blockSize UTF-16 chars.
- int actualBlockSize = ( blockSize * 2 ) + 4;
- memoryUsage += this.labelRepository.blocks.size() * actualBlockSize;
- memoryUsage += 8; // Two int values for array as a whole.
- }
- if (this.collisionMap != null) {
- memoryUsage += this.collisionMap.getMemoryUsage();
- }
- return memoryUsage;
- }
-
- /**
- * Opens the file and reloads the CompactLabelToOrdinal. The file it expects
- * is generated from the {@link #flush()} command.
- */
- static CompactLabelToOrdinal open(File file, float loadFactor,
- int numHashArrays) throws IOException {
- /**
- * Part of the file is the labelRepository, which needs to be rehashed
- * and label offsets re-added to the object. I am unsure as to why we
- * can't just store these off in the file as well, but in keeping with
- * the spirit of the original code, I did it this way. (ssuppe)
- */
- CompactLabelToOrdinal l2o = new CompactLabelToOrdinal();
- l2o.loadFactor = loadFactor;
- l2o.hashArrays = new HashArray[numHashArrays];
-
- DataInputStream dis = null;
- try {
- dis = new DataInputStream(new BufferedInputStream(
- new FileInputStream(file)));
-
- // TaxiReader needs to load the "counter" or occupancy (L2O) to know
- // the next unique facet. we used to load the delimiter too, but
- // never used it.
- l2o.counter = dis.readInt();
-
- l2o.capacity = determineCapacity((int) Math.pow(2,
- l2o.hashArrays.length), l2o.counter);
- l2o.init();
-
- // now read the chars
- l2o.labelRepository = CharBlockArray.open(dis);
-
- l2o.collisionMap = new CollisionMap(l2o.labelRepository);
-
- // Calculate hash on the fly based on how CategoryPath hashes
- // itself. Maybe in the future we can call some static based methods
- // in CategoryPath so that this doesn't break again? I don't like
- // having code in two different places...
- int cid = 0;
- // Skip the initial offset, it's the CategoryPath(0,0), which isn't
- // a hashed value.
- int offset = 1;
- int lastStartOffset = offset;
- // This loop really relies on a well-formed input (assumes pretty blindly
- // that array offsets will work). Since the initial file is machine
- // generated, I think this should be OK.
- while (offset < l2o.labelRepository.length()) {
- // First component is numcomponents, so we initialize the hash
- // to this
- int ncomponents = l2o.labelRepository.charAt(offset++);
- int hash = ncomponents;
- // If ncomponents is 0, then we are done?
- if (ncomponents != 0) {
-
- // usedchars is always the last member of the 'ends' array
- // in serialization. Rather than rebuild the entire array,
- // assign usedchars to the last value we read in. This will
- // be slightly more memory efficient.
- int usedchars = 0;
- for (int i = 0; i < ncomponents; i++) {
- usedchars = l2o.labelRepository.charAt(offset++);
- hash = hash * 31 + usedchars;
- }
- // Hash the usedchars for this label
- for (int i = 0; i < usedchars; i++) {
- hash = hash * 31 + l2o.labelRepository.charAt(offset++);
- }
- }
- // Now that we've hashed the components of the label, do the
- // final part of the hash algorithm.
- hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
- hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
- // Add the label, and let's keep going
- l2o.addLabelOffset(hash, cid, lastStartOffset);
- cid++;
- lastStartOffset = offset;
- }
-
- } catch (ClassNotFoundException cnfe) {
- throw new IOException("Invalid file format. Cannot deserialize.");
- } finally {
- if (dis != null) {
- dis.close();
- }
- }
-
- l2o.threshold = (int) (l2o.loadFactor * l2o.capacity);
- return l2o;
-
- }
-
- void flush(File file) throws IOException {
- FileOutputStream fos = new FileOutputStream(file);
-
- try {
- BufferedOutputStream os = new BufferedOutputStream(fos);
-
- DataOutputStream dos = new DataOutputStream(os);
- dos.writeInt(this.counter);
-
- // write the labelRepository
- this.labelRepository.flush(dos);
-
- // Closes the data output stream
- dos.close();
-
- } finally {
- fos.close();
- }
- }
-
- private static final class HashArray {
- int[] offsets;
- int[] cids;
-
- int capacity;
-
- HashArray(int c) {
- this.capacity = c;
- this.offsets = new int[this.capacity];
- this.cids = new int[this.capacity];
- }
- }
-}