lucene-java-3.4.0/lucene/contrib/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CompactLabelToOrdinal.java

   1 package org.apache.lucene.facet.taxonomy.writercache.cl2o;
   2
   3 import java.io.BufferedInputStream;
   4 import java.io.BufferedOutputStream;
   5 import java.io.DataInputStream;
   6 import java.io.DataOutputStream;
   7 import java.io.File;
   8 import java.io.FileInputStream;
   9 import java.io.FileOutputStream;
  10 import java.io.IOException;
  11 import java.util.Iterator;
  12
  13 import org.apache.lucene.facet.taxonomy.CategoryPath;
  14
  15 /**
  16  * Licensed to the Apache Software Foundation (ASF) under one or more
  17  * contributor license agreements.  See the NOTICE file distributed with
  18  * this work for additional information regarding copyright ownership.
  19  * The ASF licenses this file to You under the Apache License, Version 2.0
  20  * (the "License"); you may not use this file except in compliance with
  21  * the License.  You may obtain a copy of the License at
  22  *
  23  *     http://www.apache.org/licenses/LICENSE-2.0
  24  *
  25  * Unless required by applicable law or agreed to in writing, software
  26  * distributed under the License is distributed on an "AS IS" BASIS,
  27  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  28  * See the License for the specific language governing permissions and
  29  * limitations under the License.
  30  */
  31
  32 /**
  33  * This is a very efficient LabelToOrdinal implementation that uses a
  34  * CharBlockArray to store all labels and a configurable number of HashArrays to
  35  * reference the labels.
  36  * <p>
  37  * Since the HashArrays don't handle collisions, a {@link CollisionMap} is used
  38  * to store the colliding labels.
  39  * <p>
  40  * This data structure grows by adding a new HashArray whenever the number of
  41  * collisions in the {@link CollisionMap} exceeds {@code loadFactor} *
  42  * {@link #getMaxOrdinal()}. Growing also includes reinserting all colliding
  43  * labels into the HashArrays to possibly reduce the number of collisions.
  44  *
  45  * For setting the {@code loadFactor} see
  46  * {@link #CompactLabelToOrdinal(int, float, int)}.
  47  *
  48  * <p>
  49  * This data structure has a much lower memory footprint (~30%) compared to a
  50  * Java HashMap<String, Integer>. It also only uses a small fraction of objects
  51  * a HashMap would use, thus limiting the GC overhead. Ingestion speed was also
  52  * ~50% faster compared to a HashMap for 3M unique labels.
  53  *
  54  * @lucene.experimental
  55  */
  56 public class CompactLabelToOrdinal extends LabelToOrdinal {
  57
  58   public static final float DefaultLoadFactor = 0.15f;
  59
  60   static final char TerminatorChar = 0xffff;
  61   private static final int Collision = -5;
  62
  63   private HashArray[] hashArrays;
  64   private CollisionMap collisionMap;
  65   private CharBlockArray labelRepository;
  66
  67   private int capacity;
  68   private int threshold;
  69   private float loadFactor;
  70
  71   public int sizeOfMap() {
  72     return this.collisionMap.size();
  73   }
  74
  75   private CompactLabelToOrdinal() {
  76   }
  77
  78   public CompactLabelToOrdinal(int initialCapacity, float loadFactor,
  79                                 int numHashArrays) {
  80
  81     this.hashArrays = new HashArray[numHashArrays];
  82
  83     this.capacity = determineCapacity((int) Math.pow(2, numHashArrays),
  84         initialCapacity);
  85     init();
  86     this.collisionMap = new CollisionMap(this.labelRepository);
  87
  88     this.counter = 0;
  89     this.loadFactor = loadFactor;
  90
  91     this.threshold = (int) (this.loadFactor * this.capacity);
  92   }
  93
  94   static int determineCapacity(int minCapacity, int initialCapacity) {
  95     int capacity = minCapacity;
  96     while (capacity < initialCapacity) {
  97       capacity <<= 1;
  98     }
  99     return capacity;
 100   }
 101
 102   private void init() {
 103     labelRepository = new CharBlockArray();
 104     try {
 105       new CategoryPath().serializeAppendTo(labelRepository);
 106     } catch (IOException e) { }  //can't happen
 107
 108     int c = this.capacity;
 109     for (int i = 0; i < this.hashArrays.length; i++) {
 110       this.hashArrays[i] = new HashArray(c);
 111       c /= 2;
 112     }
 113   }
 114
 115   @Override
 116   public void addLabel(CategoryPath label, int ordinal) {
 117     if (this.collisionMap.size() > this.threshold) {
 118       grow();
 119     }
 120
 121     int hash = CompactLabelToOrdinal.stringHashCode(label);
 122     for (int i = 0; i < this.hashArrays.length; i++) {
 123       if (addLabel(this.hashArrays[i], label, hash, ordinal)) {
 124         return;
 125       }
 126     }
 127
 128     int prevVal = this.collisionMap.addLabel(label, hash, ordinal);
 129     if (prevVal != ordinal) {
 130       throw new IllegalArgumentException("Label already exists: " +
 131           label.toString('/') + " prev ordinal " + prevVal);
 132     }
 133   }
 134
 135   @Override
 136   public void addLabel(CategoryPath label, int prefixLen, int ordinal) {
 137     if (this.collisionMap.size() > this.threshold) {
 138       grow();
 139     }
 140
 141     int hash = CompactLabelToOrdinal.stringHashCode(label, prefixLen);
 142     for (int i = 0; i < this.hashArrays.length; i++) {
 143       if (addLabel(this.hashArrays[i], label, prefixLen, hash, ordinal)) {
 144         return;
 145       }
 146     }
 147
 148     int prevVal = this.collisionMap.addLabel(label, prefixLen, hash, ordinal);
 149     if (prevVal != ordinal) {
 150       throw new IllegalArgumentException("Label already exists: " +
 151           label.toString('/', prefixLen) + " prev ordinal " + prevVal);
 152     }
 153   }
 154
 155   @Override
 156   public int getOrdinal(CategoryPath label) {
 157     if (label == null) {
 158       return LabelToOrdinal.InvalidOrdinal;
 159     }
 160
 161     int hash = CompactLabelToOrdinal.stringHashCode(label);
 162     for (int i = 0; i < this.hashArrays.length; i++) {
 163       int ord = getOrdinal(this.hashArrays[i], label, hash);
 164       if (ord != Collision) {
 165         return ord;
 166       }
 167     }
 168
 169     return this.collisionMap.get(label, hash);
 170   }
 171
 172   @Override
 173   public int getOrdinal(CategoryPath label, int prefixLen) {
 174     if (label == null) {
 175       return LabelToOrdinal.InvalidOrdinal;
 176     }
 177
 178     int hash = CompactLabelToOrdinal.stringHashCode(label, prefixLen);
 179     for (int i = 0; i < this.hashArrays.length; i++) {
 180       int ord = getOrdinal(this.hashArrays[i], label, prefixLen, hash);
 181       if (ord != Collision) {
 182         return ord;
 183       }
 184     }
 185
 186     return this.collisionMap.get(label, prefixLen, hash);
 187   }
 188
 189   private void grow() {
 190     HashArray temp = this.hashArrays[this.hashArrays.length - 1];
 191
 192     for (int i = this.hashArrays.length - 1; i > 0; i--) {
 193       this.hashArrays[i] = this.hashArrays[i - 1];
 194     }
 195
 196     this.capacity *= 2;
 197     this.hashArrays[0] = new HashArray(this.capacity);
 198
 199     for (int i = 1; i < this.hashArrays.length; i++) {
 200       int[] sourceOffsetArray = this.hashArrays[i].offsets;
 201       int[] sourceCidsArray = this.hashArrays[i].cids;
 202
 203       for (int k = 0; k < sourceOffsetArray.length; k++) {
 204
 205         for (int j = 0; j < i && sourceOffsetArray[k] != 0; j++) {
 206           int[] targetOffsetArray = this.hashArrays[j].offsets;
 207           int[] targetCidsArray = this.hashArrays[j].cids;
 208
 209           int newIndex = indexFor(stringHashCode(
 210               this.labelRepository, sourceOffsetArray[k]),
 211               targetOffsetArray.length);
 212           if (targetOffsetArray[newIndex] == 0) {
 213             targetOffsetArray[newIndex] = sourceOffsetArray[k];
 214             targetCidsArray[newIndex] = sourceCidsArray[k];
 215             sourceOffsetArray[k] = 0;
 216           }
 217         }
 218       }
 219     }
 220
 221     for (int i = 0; i < temp.offsets.length; i++) {
 222       int offset = temp.offsets[i];
 223       if (offset > 0) {
 224         int hash = stringHashCode(this.labelRepository, offset);
 225         addLabelOffset(hash, temp.cids[i], offset);
 226       }
 227     }
 228
 229     CollisionMap oldCollisionMap = this.collisionMap;
 230     this.collisionMap = new CollisionMap(oldCollisionMap.capacity(),
 231         this.labelRepository);
 232     this.threshold = (int) (this.capacity * this.loadFactor);
 233
 234     Iterator<CollisionMap.Entry> it = oldCollisionMap.entryIterator();
 235     while (it.hasNext()) {
 236       CollisionMap.Entry e = it.next();
 237       addLabelOffset(stringHashCode(this.labelRepository, e.offset),
 238           e.cid, e.offset);
 239     }
 240   }
 241
 242   private boolean addLabel(HashArray a, CategoryPath label, int hash,
 243                             int ordinal) {
 244     int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
 245     int offset = a.offsets[index];
 246
 247     if (offset == 0) {
 248       a.offsets[index] = this.labelRepository.length();
 249       try {
 250         label.serializeAppendTo(this.labelRepository);
 251       } catch (IOException e) {
 252         // can't happen - LabelRepository.append() never throws an
 253         // exception
 254       }
 255       a.cids[index] = ordinal;
 256       return true;
 257     }
 258
 259     return false;
 260   }
 261
 262   private boolean addLabel(HashArray a, CategoryPath label, int prefixLen,
 263                             int hash, int ordinal) {
 264     int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
 265     int offset = a.offsets[index];
 266
 267     if (offset == 0) {
 268       a.offsets[index] = this.labelRepository.length();
 269       try {
 270         label.serializeAppendTo(prefixLen, this.labelRepository);
 271       } catch (IOException e) {
 272         // can't happen - LabelRepository.append() never throws an
 273         // exception
 274       }
 275       a.cids[index] = ordinal;
 276       return true;
 277     }
 278
 279     return false;
 280   }
 281
 282   private void addLabelOffset(int hash, int cid, int knownOffset) {
 283     for (int i = 0; i < this.hashArrays.length; i++) {
 284       if (addLabelOffsetToHashArray(this.hashArrays[i], hash, cid,
 285           knownOffset)) {
 286         return;
 287       }
 288     }
 289
 290     this.collisionMap.addLabelOffset(hash, knownOffset, cid);
 291
 292     if (this.collisionMap.size() > this.threshold) {
 293       grow();
 294     }
 295   }
 296
 297   private boolean addLabelOffsetToHashArray(HashArray a, int hash, int ordinal,
 298                                             int knownOffset) {
 299
 300     int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
 301     int offset = a.offsets[index];
 302
 303     if (offset == 0) {
 304       a.offsets[index] = knownOffset;
 305       a.cids[index] = ordinal;
 306       return true;
 307     }
 308
 309     return false;
 310   }
 311
 312   private int getOrdinal(HashArray a, CategoryPath label, int hash) {
 313     if (label == null) {
 314       return LabelToOrdinal.InvalidOrdinal;
 315     }
 316
 317     int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
 318     int offset = a.offsets[index];
 319     if (offset == 0) {
 320       return LabelToOrdinal.InvalidOrdinal;
 321     }
 322
 323     if (label.equalsToSerialized(labelRepository, offset)) {
 324       return a.cids[index];
 325     }
 326
 327     return Collision;
 328   }
 329
 330   private int getOrdinal(HashArray a, CategoryPath label, int prefixLen, int hash) {
 331     if (label == null) {
 332       return LabelToOrdinal.InvalidOrdinal;
 333     }
 334
 335     int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
 336     int offset = a.offsets[index];
 337     if (offset == 0) {
 338       return LabelToOrdinal.InvalidOrdinal;
 339     }
 340
 341     if (label.equalsToSerialized(prefixLen, labelRepository, offset)) {
 342       return a.cids[index];
 343     }
 344
 345     return Collision;
 346   }
 347
 348   /**
 349    * Returns index for hash code h.
 350    */
 351   static int indexFor(int h, int length) {
 352     return h & (length - 1);
 353   }
 354
 355   // static int stringHashCode(String label) {
 356   // int len = label.length();
 357   // int hash = 0;
 358   // int i;
 359   // for (i = 0; i < len; ++i)
 360   // hash = 33 * hash + label.charAt(i);
 361   //
 362   // hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
 363   // hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
 364   //
 365   // return hash;
 366   //
 367   // }
 368
 369   static int stringHashCode(CategoryPath label) {
 370     int hash = label.hashCode();
 371
 372     hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
 373     hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
 374
 375     return hash;
 376
 377   }
 378
 379   static int stringHashCode(CategoryPath label, int prefixLen) {
 380     int hash = label.hashCode(prefixLen);
 381
 382     hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
 383     hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
 384
 385     return hash;
 386
 387   }
 388
 389   static int stringHashCode(CharBlockArray labelRepository, int offset) {
 390     int hash = CategoryPath.hashCodeOfSerialized(labelRepository, offset);
 391
 392     hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
 393     hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
 394
 395     return hash;
 396   }
 397
 398   // public static boolean equals(CharSequence label, CharBlockArray array,
 399   // int offset) {
 400   // // CONTINUE HERE
 401   // int len = label.length();
 402   // int bi = array.blockIndex(offset);
 403   // CharBlockArray.Block b = array.blocks.get(bi);
 404   // int index = array.indexInBlock(offset);
 405   //
 406   // for (int i = 0; i < len; i++) {
 407   // if (label.charAt(i) != b.chars[index]) {
 408   // return false;
 409   // }
 410   // index++;
 411   // if (index == b.length) {
 412   // b = array.blocks.get(++bi);
 413   // index = 0;
 414   // }
 415   // }
 416   //
 417   // return b.chars[index] == TerminatorChar;
 418   // }
 419
 420   /**
 421    * Returns an estimate of the amount of memory used by this table. Called only in
 422    * this package. Memory is consumed mainly by three structures: the hash arrays,
 423    * label repository and collision map.
 424    */
 425   int getMemoryUsage() {
 426     int memoryUsage = 0;
 427     if (this.hashArrays != null) {
 428       // HashArray capacity is instance-specific.
 429       for (HashArray ha : this.hashArrays) {
 430         // Each has 2 capacity-length arrays of ints.
 431         memoryUsage += ( ha.capacity * 2 * 4 ) + 4;
 432       }
 433     }
 434     if (this.labelRepository != null) {
 435       // All blocks are the same size.
 436       int blockSize = this.labelRepository.blockSize;
 437       // Each block has room for blockSize UTF-16 chars.
 438       int actualBlockSize = ( blockSize * 2 ) + 4;
 439       memoryUsage += this.labelRepository.blocks.size() * actualBlockSize;
 440       memoryUsage += 8;   // Two int values for array as a whole.
 441     }
 442     if (this.collisionMap != null) {
 443       memoryUsage += this.collisionMap.getMemoryUsage();
 444     }
 445     return memoryUsage;
 446   }
 447
 448   /**
 449    * Opens the file and reloads the CompactLabelToOrdinal. The file it expects
 450    * is generated from the {@link #flush()} command.
 451    */
 452   static CompactLabelToOrdinal open(File file, float loadFactor,
 453                                     int numHashArrays) throws IOException {
 454     /**
 455      * Part of the file is the labelRepository, which needs to be rehashed
 456      * and label offsets re-added to the object. I am unsure as to why we
 457      * can't just store these off in the file as well, but in keeping with
 458      * the spirit of the original code, I did it this way. (ssuppe)
 459      */
 460     CompactLabelToOrdinal l2o = new CompactLabelToOrdinal();
 461     l2o.loadFactor = loadFactor;
 462     l2o.hashArrays = new HashArray[numHashArrays];
 463
 464     DataInputStream dis = null;
 465     try {
 466       dis = new DataInputStream(new BufferedInputStream(
 467           new FileInputStream(file)));
 468
 469       // TaxiReader needs to load the "counter" or occupancy (L2O) to know
 470       // the next unique facet. we used to load the delimiter too, but
 471       // never used it.
 472       l2o.counter = dis.readInt();
 473
 474       l2o.capacity = determineCapacity((int) Math.pow(2,
 475           l2o.hashArrays.length), l2o.counter);
 476       l2o.init();
 477
 478       // now read the chars
 479       l2o.labelRepository = CharBlockArray.open(dis);
 480
 481       l2o.collisionMap = new CollisionMap(l2o.labelRepository);
 482
 483       // Calculate hash on the fly based on how CategoryPath hashes
 484       // itself. Maybe in the future we can call some static based methods
 485       // in CategoryPath so that this doesn't break again? I don't like
 486       // having code in two different places...
 487       int cid = 0;
 488       // Skip the initial offset, it's the CategoryPath(0,0), which isn't
 489       // a hashed value.
 490       int offset = 1;
 491       int lastStartOffset = offset;
 492       // This loop really relies on a well-formed input (assumes pretty blindly
 493       // that array offsets will work).  Since the initial file is machine
 494       // generated, I think this should be OK.
 495       while (offset < l2o.labelRepository.length()) {
 496         // First component is numcomponents, so we initialize the hash
 497         // to this
 498         int ncomponents = l2o.labelRepository.charAt(offset++);
 499         int hash = ncomponents;
 500         // If ncomponents is 0, then we are done?
 501         if (ncomponents != 0) {
 502
 503           // usedchars is always the last member of the 'ends' array
 504           // in serialization. Rather than rebuild the entire array,
 505           // assign usedchars to the last value we read in. This will
 506           // be slightly more memory efficient.
 507           int usedchars = 0;
 508           for (int i = 0; i < ncomponents; i++) {
 509             usedchars = l2o.labelRepository.charAt(offset++);
 510             hash = hash * 31 + usedchars;
 511           }
 512           // Hash the usedchars for this label
 513           for (int i = 0; i < usedchars; i++) {
 514             hash = hash * 31 + l2o.labelRepository.charAt(offset++);
 515           }
 516         }
 517         // Now that we've hashed the components of the label, do the
 518         // final part of the hash algorithm.
 519         hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
 520         hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
 521         // Add the label, and let's keep going
 522         l2o.addLabelOffset(hash, cid, lastStartOffset);
 523         cid++;
 524         lastStartOffset = offset;
 525       }
 526
 527     } catch (ClassNotFoundException cnfe) {
 528       throw new IOException("Invalid file format. Cannot deserialize.");
 529     } finally {
 530       if (dis != null) {
 531         dis.close();
 532       }
 533     }
 534
 535     l2o.threshold = (int) (l2o.loadFactor * l2o.capacity);
 536     return l2o;
 537
 538   }
 539
 540   void flush(File file) throws IOException {
 541     FileOutputStream fos = new FileOutputStream(file);
 542
 543     try {
 544       BufferedOutputStream os = new BufferedOutputStream(fos);
 545
 546       DataOutputStream dos = new DataOutputStream(os);
 547       dos.writeInt(this.counter);
 548
 549       // write the labelRepository
 550       this.labelRepository.flush(dos);
 551
 552       // Closes the data output stream
 553       dos.close();
 554
 555     } finally {
 556       fos.close();
 557     }
 558   }
 559
 560   private static final class HashArray {
 561     int[] offsets;
 562     int[] cids;
 563
 564     int capacity;
 565
 566     HashArray(int c) {
 567       this.capacity = c;
 568       this.offsets = new int[this.capacity];
 569       this.cids = new int[this.capacity];
 570     }
 571   }
 572 }