1 package org.apache.lucene.facet.taxonomy.writercache.cl2o;
3 import java.io.BufferedInputStream;
4 import java.io.BufferedOutputStream;
5 import java.io.DataInputStream;
6 import java.io.DataOutputStream;
8 import java.io.FileInputStream;
9 import java.io.FileOutputStream;
10 import java.io.IOException;
11 import java.util.Iterator;
13 import org.apache.lucene.facet.taxonomy.CategoryPath;
16 * Licensed to the Apache Software Foundation (ASF) under one or more
17 * contributor license agreements. See the NOTICE file distributed with
18 * this work for additional information regarding copyright ownership.
19 * The ASF licenses this file to You under the Apache License, Version 2.0
20 * (the "License"); you may not use this file except in compliance with
21 * the License. You may obtain a copy of the License at
23 * http://www.apache.org/licenses/LICENSE-2.0
25 * Unless required by applicable law or agreed to in writing, software
26 * distributed under the License is distributed on an "AS IS" BASIS,
27 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28 * See the License for the specific language governing permissions and
29 * limitations under the License.
33 * This is a very efficient LabelToOrdinal implementation that uses a
34 * CharBlockArray to store all labels and a configurable number of HashArrays to
35 * reference the labels.
37 * Since the HashArrays don't handle collisions, a {@link CollisionMap} is used
38 * to store the colliding labels.
40 * This data structure grows by adding a new HashArray whenever the number of
41 * collisions in the {@link CollisionMap} exceeds {@code loadFactor} *
42 * {@link #getMaxOrdinal()}. Growing also includes reinserting all colliding
43 * labels into the HashArrays to possibly reduce the number of collisions.
45 * For setting the {@code loadFactor} see
46 * {@link #CompactLabelToOrdinal(int, float, int)}.
49 * This data structure has a much lower memory footprint (~30%) compared to a
50 * Java HashMap<String, Integer>. It also only uses a small fraction of objects
51 * a HashMap would use, thus limiting the GC overhead. Ingestion speed was also
52 * ~50% faster compared to a HashMap for 3M unique labels.
54 * @lucene.experimental
56 public class CompactLabelToOrdinal extends LabelToOrdinal {
58 public static final float DefaultLoadFactor = 0.15f;
60 static final char TerminatorChar = 0xffff;
61 private static final int Collision = -5;
63 private HashArray[] hashArrays;
64 private CollisionMap collisionMap;
65 private CharBlockArray labelRepository;
68 private int threshold;
69 private float loadFactor;
71 public int sizeOfMap() {
72 return this.collisionMap.size();
75 private CompactLabelToOrdinal() {
78 public CompactLabelToOrdinal(int initialCapacity, float loadFactor,
81 this.hashArrays = new HashArray[numHashArrays];
83 this.capacity = determineCapacity((int) Math.pow(2, numHashArrays),
86 this.collisionMap = new CollisionMap(this.labelRepository);
89 this.loadFactor = loadFactor;
91 this.threshold = (int) (this.loadFactor * this.capacity);
94 static int determineCapacity(int minCapacity, int initialCapacity) {
95 int capacity = minCapacity;
96 while (capacity < initialCapacity) {
102 private void init() {
103 labelRepository = new CharBlockArray();
105 new CategoryPath().serializeAppendTo(labelRepository);
106 } catch (IOException e) { } //can't happen
108 int c = this.capacity;
109 for (int i = 0; i < this.hashArrays.length; i++) {
110 this.hashArrays[i] = new HashArray(c);
116 public void addLabel(CategoryPath label, int ordinal) {
117 if (this.collisionMap.size() > this.threshold) {
121 int hash = CompactLabelToOrdinal.stringHashCode(label);
122 for (int i = 0; i < this.hashArrays.length; i++) {
123 if (addLabel(this.hashArrays[i], label, hash, ordinal)) {
128 int prevVal = this.collisionMap.addLabel(label, hash, ordinal);
129 if (prevVal != ordinal) {
130 throw new IllegalArgumentException("Label already exists: " +
131 label.toString('/') + " prev ordinal " + prevVal);
136 public void addLabel(CategoryPath label, int prefixLen, int ordinal) {
137 if (this.collisionMap.size() > this.threshold) {
141 int hash = CompactLabelToOrdinal.stringHashCode(label, prefixLen);
142 for (int i = 0; i < this.hashArrays.length; i++) {
143 if (addLabel(this.hashArrays[i], label, prefixLen, hash, ordinal)) {
148 int prevVal = this.collisionMap.addLabel(label, prefixLen, hash, ordinal);
149 if (prevVal != ordinal) {
150 throw new IllegalArgumentException("Label already exists: " +
151 label.toString('/', prefixLen) + " prev ordinal " + prevVal);
156 public int getOrdinal(CategoryPath label) {
158 return LabelToOrdinal.InvalidOrdinal;
161 int hash = CompactLabelToOrdinal.stringHashCode(label);
162 for (int i = 0; i < this.hashArrays.length; i++) {
163 int ord = getOrdinal(this.hashArrays[i], label, hash);
164 if (ord != Collision) {
169 return this.collisionMap.get(label, hash);
173 public int getOrdinal(CategoryPath label, int prefixLen) {
175 return LabelToOrdinal.InvalidOrdinal;
178 int hash = CompactLabelToOrdinal.stringHashCode(label, prefixLen);
179 for (int i = 0; i < this.hashArrays.length; i++) {
180 int ord = getOrdinal(this.hashArrays[i], label, prefixLen, hash);
181 if (ord != Collision) {
186 return this.collisionMap.get(label, prefixLen, hash);
189 private void grow() {
190 HashArray temp = this.hashArrays[this.hashArrays.length - 1];
192 for (int i = this.hashArrays.length - 1; i > 0; i--) {
193 this.hashArrays[i] = this.hashArrays[i - 1];
197 this.hashArrays[0] = new HashArray(this.capacity);
199 for (int i = 1; i < this.hashArrays.length; i++) {
200 int[] sourceOffsetArray = this.hashArrays[i].offsets;
201 int[] sourceCidsArray = this.hashArrays[i].cids;
203 for (int k = 0; k < sourceOffsetArray.length; k++) {
205 for (int j = 0; j < i && sourceOffsetArray[k] != 0; j++) {
206 int[] targetOffsetArray = this.hashArrays[j].offsets;
207 int[] targetCidsArray = this.hashArrays[j].cids;
209 int newIndex = indexFor(stringHashCode(
210 this.labelRepository, sourceOffsetArray[k]),
211 targetOffsetArray.length);
212 if (targetOffsetArray[newIndex] == 0) {
213 targetOffsetArray[newIndex] = sourceOffsetArray[k];
214 targetCidsArray[newIndex] = sourceCidsArray[k];
215 sourceOffsetArray[k] = 0;
221 for (int i = 0; i < temp.offsets.length; i++) {
222 int offset = temp.offsets[i];
224 int hash = stringHashCode(this.labelRepository, offset);
225 addLabelOffset(hash, temp.cids[i], offset);
229 CollisionMap oldCollisionMap = this.collisionMap;
230 this.collisionMap = new CollisionMap(oldCollisionMap.capacity(),
231 this.labelRepository);
232 this.threshold = (int) (this.capacity * this.loadFactor);
234 Iterator<CollisionMap.Entry> it = oldCollisionMap.entryIterator();
235 while (it.hasNext()) {
236 CollisionMap.Entry e = it.next();
237 addLabelOffset(stringHashCode(this.labelRepository, e.offset),
242 private boolean addLabel(HashArray a, CategoryPath label, int hash,
244 int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
245 int offset = a.offsets[index];
248 a.offsets[index] = this.labelRepository.length();
250 label.serializeAppendTo(this.labelRepository);
251 } catch (IOException e) {
252 // can't happen - LabelRepository.append() never throws an
255 a.cids[index] = ordinal;
262 private boolean addLabel(HashArray a, CategoryPath label, int prefixLen,
263 int hash, int ordinal) {
264 int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
265 int offset = a.offsets[index];
268 a.offsets[index] = this.labelRepository.length();
270 label.serializeAppendTo(prefixLen, this.labelRepository);
271 } catch (IOException e) {
272 // can't happen - LabelRepository.append() never throws an
275 a.cids[index] = ordinal;
282 private void addLabelOffset(int hash, int cid, int knownOffset) {
283 for (int i = 0; i < this.hashArrays.length; i++) {
284 if (addLabelOffsetToHashArray(this.hashArrays[i], hash, cid,
290 this.collisionMap.addLabelOffset(hash, knownOffset, cid);
292 if (this.collisionMap.size() > this.threshold) {
297 private boolean addLabelOffsetToHashArray(HashArray a, int hash, int ordinal,
300 int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
301 int offset = a.offsets[index];
304 a.offsets[index] = knownOffset;
305 a.cids[index] = ordinal;
312 private int getOrdinal(HashArray a, CategoryPath label, int hash) {
314 return LabelToOrdinal.InvalidOrdinal;
317 int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
318 int offset = a.offsets[index];
320 return LabelToOrdinal.InvalidOrdinal;
323 if (label.equalsToSerialized(labelRepository, offset)) {
324 return a.cids[index];
330 private int getOrdinal(HashArray a, CategoryPath label, int prefixLen, int hash) {
332 return LabelToOrdinal.InvalidOrdinal;
335 int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
336 int offset = a.offsets[index];
338 return LabelToOrdinal.InvalidOrdinal;
341 if (label.equalsToSerialized(prefixLen, labelRepository, offset)) {
342 return a.cids[index];
349 * Returns index for hash code h.
351 static int indexFor(int h, int length) {
352 return h & (length - 1);
355 // static int stringHashCode(String label) {
356 // int len = label.length();
359 // for (i = 0; i < len; ++i)
360 // hash = 33 * hash + label.charAt(i);
362 // hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
363 // hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
369 static int stringHashCode(CategoryPath label) {
370 int hash = label.hashCode();
372 hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
373 hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
379 static int stringHashCode(CategoryPath label, int prefixLen) {
380 int hash = label.hashCode(prefixLen);
382 hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
383 hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
389 static int stringHashCode(CharBlockArray labelRepository, int offset) {
390 int hash = CategoryPath.hashCodeOfSerialized(labelRepository, offset);
392 hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
393 hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
398 // public static boolean equals(CharSequence label, CharBlockArray array,
401 // int len = label.length();
402 // int bi = array.blockIndex(offset);
403 // CharBlockArray.Block b = array.blocks.get(bi);
404 // int index = array.indexInBlock(offset);
406 // for (int i = 0; i < len; i++) {
407 // if (label.charAt(i) != b.chars[index]) {
411 // if (index == b.length) {
412 // b = array.blocks.get(++bi);
417 // return b.chars[index] == TerminatorChar;
421 * Returns an estimate of the amount of memory used by this table. Called only in
422 * this package. Memory is consumed mainly by three structures: the hash arrays,
423 * label repository and collision map.
425 int getMemoryUsage() {
427 if (this.hashArrays != null) {
428 // HashArray capacity is instance-specific.
429 for (HashArray ha : this.hashArrays) {
430 // Each has 2 capacity-length arrays of ints.
431 memoryUsage += ( ha.capacity * 2 * 4 ) + 4;
434 if (this.labelRepository != null) {
435 // All blocks are the same size.
436 int blockSize = this.labelRepository.blockSize;
437 // Each block has room for blockSize UTF-16 chars.
438 int actualBlockSize = ( blockSize * 2 ) + 4;
439 memoryUsage += this.labelRepository.blocks.size() * actualBlockSize;
440 memoryUsage += 8; // Two int values for array as a whole.
442 if (this.collisionMap != null) {
443 memoryUsage += this.collisionMap.getMemoryUsage();
449 * Opens the file and reloads the CompactLabelToOrdinal. The file it expects
450 * is generated from the {@link #flush()} command.
452 static CompactLabelToOrdinal open(File file, float loadFactor,
453 int numHashArrays) throws IOException {
455 * Part of the file is the labelRepository, which needs to be rehashed
456 * and label offsets re-added to the object. I am unsure as to why we
457 * can't just store these off in the file as well, but in keeping with
458 * the spirit of the original code, I did it this way. (ssuppe)
460 CompactLabelToOrdinal l2o = new CompactLabelToOrdinal();
461 l2o.loadFactor = loadFactor;
462 l2o.hashArrays = new HashArray[numHashArrays];
464 DataInputStream dis = null;
466 dis = new DataInputStream(new BufferedInputStream(
467 new FileInputStream(file)));
469 // TaxiReader needs to load the "counter" or occupancy (L2O) to know
470 // the next unique facet. we used to load the delimiter too, but
472 l2o.counter = dis.readInt();
474 l2o.capacity = determineCapacity((int) Math.pow(2,
475 l2o.hashArrays.length), l2o.counter);
478 // now read the chars
479 l2o.labelRepository = CharBlockArray.open(dis);
481 l2o.collisionMap = new CollisionMap(l2o.labelRepository);
483 // Calculate hash on the fly based on how CategoryPath hashes
484 // itself. Maybe in the future we can call some static based methods
485 // in CategoryPath so that this doesn't break again? I don't like
486 // having code in two different places...
488 // Skip the initial offset, it's the CategoryPath(0,0), which isn't
491 int lastStartOffset = offset;
492 // This loop really relies on a well-formed input (assumes pretty blindly
493 // that array offsets will work). Since the initial file is machine
494 // generated, I think this should be OK.
495 while (offset < l2o.labelRepository.length()) {
496 // First component is numcomponents, so we initialize the hash
498 int ncomponents = l2o.labelRepository.charAt(offset++);
499 int hash = ncomponents;
500 // If ncomponents is 0, then we are done?
501 if (ncomponents != 0) {
503 // usedchars is always the last member of the 'ends' array
504 // in serialization. Rather than rebuild the entire array,
505 // assign usedchars to the last value we read in. This will
506 // be slightly more memory efficient.
508 for (int i = 0; i < ncomponents; i++) {
509 usedchars = l2o.labelRepository.charAt(offset++);
510 hash = hash * 31 + usedchars;
512 // Hash the usedchars for this label
513 for (int i = 0; i < usedchars; i++) {
514 hash = hash * 31 + l2o.labelRepository.charAt(offset++);
517 // Now that we've hashed the components of the label, do the
518 // final part of the hash algorithm.
519 hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
520 hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
521 // Add the label, and let's keep going
522 l2o.addLabelOffset(hash, cid, lastStartOffset);
524 lastStartOffset = offset;
527 } catch (ClassNotFoundException cnfe) {
528 throw new IOException("Invalid file format. Cannot deserialize.");
535 l2o.threshold = (int) (l2o.loadFactor * l2o.capacity);
540 void flush(File file) throws IOException {
541 FileOutputStream fos = new FileOutputStream(file);
544 BufferedOutputStream os = new BufferedOutputStream(fos);
546 DataOutputStream dos = new DataOutputStream(os);
547 dos.writeInt(this.counter);
549 // write the labelRepository
550 this.labelRepository.flush(dos);
552 // Closes the data output stream
560 private static final class HashArray {
568 this.offsets = new int[this.capacity];
569 this.cids = new int[this.capacity];