1 package org.apache.lucene.util;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
22 import org.apache.lucene.store.Directory;
23 import org.apache.lucene.store.IndexInput;
24 import org.apache.lucene.store.IndexOutput;
26 /** Optimized implementation of a vector of bits. This is more-or-less like
27 * java.util.BitSet, but also includes the following:
29 * <li>a count() method, which efficiently computes the number of one bits;</li>
30 * <li>optimized read from and write to disk;</li>
31 * <li>inlinable get() method;</li>
32 * <li>store and load, as bit set or d-gaps, depending on sparseness;</li>
37 public final class BitVector implements Cloneable, Bits {
43 /** Constructs a vector capable of holding <code>n</code> bits. */
44 public BitVector(int n) {
46 bits = new byte[getNumBytes(size)];
50 BitVector(byte[] bits, int size) {
56 private int getNumBytes(int size) {
57 int bytesLength = size >>> 3;
58 if ((size & 7) != 0) {
65 public Object clone() {
66 byte[] copyBits = new byte[bits.length];
67 System.arraycopy(bits, 0, copyBits, 0, bits.length);
68 BitVector clone = new BitVector(copyBits, size);
73 /** Sets the value of <code>bit</code> to one. */
74 public final void set(int bit) {
76 throw new ArrayIndexOutOfBoundsException("bit=" + bit + " size=" + size);
78 bits[bit >> 3] |= 1 << (bit & 7);
82 /** Sets the value of <code>bit</code> to true, and
83 * returns true if bit was already set */
84 public final boolean getAndSet(int bit) {
86 throw new ArrayIndexOutOfBoundsException("bit=" + bit + " size=" + size);
88 final int pos = bit >> 3;
89 final int v = bits[pos];
90 final int flag = 1 << (bit & 7);
94 bits[pos] = (byte) (v | flag);
101 /** Sets the value of <code>bit</code> to zero. */
102 public final void clear(int bit) {
104 throw new ArrayIndexOutOfBoundsException(bit);
106 bits[bit >> 3] &= ~(1 << (bit & 7));
110 /** Returns <code>true</code> if <code>bit</code> is one and
111 <code>false</code> if it is zero. */
112 public final boolean get(int bit) {
113 assert bit >= 0 && bit < size: "bit " + bit + " is out of bounds 0.." + (size-1);
114 return (bits[bit >> 3] & (1 << (bit & 7))) != 0;
117 /** Returns the number of bits in this vector. This is also one greater than
118 the number of the largest valid bit number. */
119 public final int size() {
123 /** Returns the number of bits in this vector. This is also one greater than
124 the number of the largest valid bit number. */
125 public final int length() {
129 /** Returns the total number of one bits in this vector. This is efficiently
130 computed and cached, so that, if the vector is not changed, no
131 recomputation is done for repeated calls. */
132 public final int count() {
133 // if the vector has been modified
136 int end = bits.length;
137 for (int i = 0; i < end; i++)
138 c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte
145 public final int getRecomputedCount() {
147 int end = bits.length;
148 for (int i = 0; i < end; i++)
149 c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte
153 private static final byte[] BYTE_COUNTS = { // table of bits/byte
154 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
155 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
156 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
157 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
158 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
159 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
160 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
161 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
162 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
163 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
164 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
165 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
166 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
167 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
168 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
169 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
172 private static String CODEC = "BitVector";
174 // Version before version tracking was added:
175 private final static int VERSION_PRE = -1;
178 private final static int VERSION_START = 0;
180 // Increment version to change it:
181 private final static int VERSION_CURRENT = VERSION_START;
183 /** Writes this vector to the file <code>name</code> in Directory
184 <code>d</code>, in a format that can be read by the constructor {@link
185 #BitVector(Directory, String)}. */
186 public final void write(Directory d, String name) throws IOException {
187 IndexOutput output = d.createOutput(name);
190 CodecUtil.writeHeader(output, CODEC, VERSION_CURRENT);
192 writeDgaps(output); // sparse bit-set more efficiently saved as d-gaps.
201 /** Write as a bit set */
202 private void writeBits(IndexOutput output) throws IOException {
203 output.writeInt(size()); // write size
204 output.writeInt(count()); // write count
205 output.writeBytes(bits, bits.length);
208 /** Write as a d-gaps list */
209 private void writeDgaps(IndexOutput output) throws IOException {
210 output.writeInt(-1); // mark using d-gaps
211 output.writeInt(size()); // write size
212 output.writeInt(count()); // write count
216 for (int i=0; i<m && n>0; i++) {
218 output.writeVInt(i-last);
219 output.writeByte(bits[i]);
221 n -= BYTE_COUNTS[bits[i] & 0xFF];
226 /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */
227 private boolean isSparse() {
229 final int setCount = count();
234 final int avgGapLength = bits.length / setCount;
236 // expected number of bytes for vInt encoding of each gap
237 final int expectedDGapBytes;
238 if (avgGapLength <= (1<< 7)) {
239 expectedDGapBytes = 1;
240 } else if (avgGapLength <= (1<<14)) {
241 expectedDGapBytes = 2;
242 } else if (avgGapLength <= (1<<21)) {
243 expectedDGapBytes = 3;
244 } else if (avgGapLength <= (1<<28)) {
245 expectedDGapBytes = 4;
247 expectedDGapBytes = 5;
250 // +1 because we write the byte itself that contains the
252 final int bytesPerSetBit = expectedDGapBytes + 1;
254 // note: adding 32 because we start with ((int) -1) to indicate d-gaps format.
255 final long expectedBits = 32 + 8 * bytesPerSetBit * count();
257 // note: factor is for read/write of byte-arrays being faster than vints.
258 final long factor = 10;
259 return factor * expectedBits < size();
262 /** Constructs a bit vector from the file <code>name</code> in Directory
263 <code>d</code>, as written by the {@link #write} method.
265 public BitVector(Directory d, String name) throws IOException {
266 IndexInput input = d.openInput(name);
269 final int firstInt = input.readInt();
271 if (firstInt == -2) {
272 // New format, with full header & version:
273 version = CodecUtil.checkHeader(input, CODEC, VERSION_START, VERSION_START);
274 size = input.readInt();
276 version = VERSION_PRE;
289 /** Read as a bit set */
290 private void readBits(IndexInput input) throws IOException {
291 count = input.readInt(); // read count
292 bits = new byte[getNumBytes(size)]; // allocate bits
293 input.readBytes(bits, 0, bits.length);
296 /** read as a d-gaps list */
297 private void readDgaps(IndexInput input) throws IOException {
298 size = input.readInt(); // (re)read size
299 count = input.readInt(); // read count
300 bits = new byte[(size >> 3) + 1]; // allocate bits
304 last += input.readVInt();
305 bits[last] = input.readByte();
306 n -= BYTE_COUNTS[bits[last] & 0xFF];