1 package org.apache.lucene.util;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
22 import org.apache.lucene.store.Directory;
23 import org.apache.lucene.store.IndexInput;
24 import org.apache.lucene.store.IndexOutput;
26 /** Optimized implementation of a vector of bits. This is more-or-less like
27 * java.util.BitSet, but also includes the following:
29 * <li>a count() method, which efficiently computes the number of one bits;</li>
30 * <li>optimized read from and write to disk;</li>
31 * <li>inlinable get() method;</li>
32 * <li>store and load, as bit set or d-gaps, depending on sparseness;</li>
37 public final class BitVector implements Cloneable {
43 /** Constructs a vector capable of holding <code>n</code> bits. */
44 public BitVector(int n) {
46 bits = new byte[getNumBytes(size)];
50 BitVector(byte[] bits, int size) {
56 private int getNumBytes(int size) {
57 int bytesLength = size >>> 3;
58 if ((size & 7) != 0) {
65 public Object clone() {
66 byte[] copyBits = new byte[bits.length];
67 System.arraycopy(bits, 0, copyBits, 0, bits.length);
68 BitVector clone = new BitVector(copyBits, size);
73 /** Sets the value of <code>bit</code> to one. */
74 public final void set(int bit) {
76 throw new ArrayIndexOutOfBoundsException("bit=" + bit + " size=" + size);
78 bits[bit >> 3] |= 1 << (bit & 7);
82 /** Sets the value of <code>bit</code> to true, and
83 * returns true if bit was already set */
84 public final boolean getAndSet(int bit) {
86 throw new ArrayIndexOutOfBoundsException("bit=" + bit + " size=" + size);
88 final int pos = bit >> 3;
89 final int v = bits[pos];
90 final int flag = 1 << (bit & 7);
94 bits[pos] = (byte) (v | flag);
101 /** Sets the value of <code>bit</code> to zero. */
102 public final void clear(int bit) {
104 throw new ArrayIndexOutOfBoundsException(bit);
106 bits[bit >> 3] &= ~(1 << (bit & 7));
110 /** Returns <code>true</code> if <code>bit</code> is one and
111 <code>false</code> if it is zero. */
112 public final boolean get(int bit) {
113 assert bit >= 0 && bit < size: "bit " + bit + " is out of bounds 0.." + (size-1);
114 return (bits[bit >> 3] & (1 << (bit & 7))) != 0;
117 /** Returns the number of bits in this vector. This is also one greater than
118 the number of the largest valid bit number. */
119 public final int size() {
123 /** Returns the total number of one bits in this vector. This is efficiently
124 computed and cached, so that, if the vector is not changed, no
125 recomputation is done for repeated calls. */
126 public final int count() {
127 // if the vector has been modified
130 int end = bits.length;
131 for (int i = 0; i < end; i++)
132 c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte
139 public final int getRecomputedCount() {
141 int end = bits.length;
142 for (int i = 0; i < end; i++)
143 c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte
147 private static final byte[] BYTE_COUNTS = { // table of bits/byte
148 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
149 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
150 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
151 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
152 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
153 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
154 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
155 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
156 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
157 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
158 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
159 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
160 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
161 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
162 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
163 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
166 private static String CODEC = "BitVector";
168 // Version before version tracking was added:
169 private final static int VERSION_PRE = -1;
172 private final static int VERSION_START = 0;
174 // Increment version to change it:
175 private final static int VERSION_CURRENT = VERSION_START;
177 /** Writes this vector to the file <code>name</code> in Directory
178 <code>d</code>, in a format that can be read by the constructor {@link
179 #BitVector(Directory, String)}. */
180 public final void write(Directory d, String name) throws IOException {
181 IndexOutput output = d.createOutput(name);
184 CodecUtil.writeHeader(output, CODEC, VERSION_CURRENT);
186 writeDgaps(output); // sparse bit-set more efficiently saved as d-gaps.
195 /** Write as a bit set */
196 private void writeBits(IndexOutput output) throws IOException {
197 output.writeInt(size()); // write size
198 output.writeInt(count()); // write count
199 output.writeBytes(bits, bits.length);
202 /** Write as a d-gaps list */
203 private void writeDgaps(IndexOutput output) throws IOException {
204 output.writeInt(-1); // mark using d-gaps
205 output.writeInt(size()); // write size
206 output.writeInt(count()); // write count
210 for (int i=0; i<m && n>0; i++) {
212 output.writeVInt(i-last);
213 output.writeByte(bits[i]);
215 n -= BYTE_COUNTS[bits[i] & 0xFF];
220 /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */
221 private boolean isSparse() {
223 final int setCount = count();
228 final int avgGapLength = bits.length / setCount;
230 // expected number of bytes for vInt encoding of each gap
231 final int expectedDGapBytes;
232 if (avgGapLength <= (1<< 7)) {
233 expectedDGapBytes = 1;
234 } else if (avgGapLength <= (1<<14)) {
235 expectedDGapBytes = 2;
236 } else if (avgGapLength <= (1<<21)) {
237 expectedDGapBytes = 3;
238 } else if (avgGapLength <= (1<<28)) {
239 expectedDGapBytes = 4;
241 expectedDGapBytes = 5;
244 // +1 because we write the byte itself that contains the
246 final int bytesPerSetBit = expectedDGapBytes + 1;
248 // note: adding 32 because we start with ((int) -1) to indicate d-gaps format.
249 final long expectedBits = 32 + 8 * bytesPerSetBit * count();
251 // note: factor is for read/write of byte-arrays being faster than vints.
252 final long factor = 10;
253 return factor * expectedBits < size();
256 /** Constructs a bit vector from the file <code>name</code> in Directory
257 <code>d</code>, as written by the {@link #write} method.
259 public BitVector(Directory d, String name) throws IOException {
260 IndexInput input = d.openInput(name);
263 final int firstInt = input.readInt();
265 if (firstInt == -2) {
266 // New format, with full header & version:
267 version = CodecUtil.checkHeader(input, CODEC, VERSION_START, VERSION_START);
268 size = input.readInt();
270 version = VERSION_PRE;
283 /** Read as a bit set */
284 private void readBits(IndexInput input) throws IOException {
285 count = input.readInt(); // read count
286 bits = new byte[getNumBytes(size)]; // allocate bits
287 input.readBytes(bits, 0, bits.length);
290 /** read as a d-gaps list */
291 private void readDgaps(IndexInput input) throws IOException {
292 size = input.readInt(); // (re)read size
293 count = input.readInt(); // read count
294 bits = new byte[(size >> 3) + 1]; // allocate bits
298 last += input.readVInt();
299 bits[last] = input.readByte();
300 n -= BYTE_COUNTS[bits[last] & 0xFF];