1 package org.apache.lucene.util;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.nio.CharBuffer;
21 import java.nio.ByteBuffer;
24 * Provides support for converting byte sequences to Strings and back again.
25 * The resulting Strings preserve the original byte sequences' sort order.
27 * The Strings are constructed using a Base 8000h encoding of the original
28 * binary data - each char of an encoded String represents a 15-bit chunk
29 * from the byte sequence. Base 8000h was chosen because it allows for all
30 * lower 15 bits of char to be used without restriction; the surrogate range
31 * [U+D8000-U+DFFF] does not represent valid chars, and would require
32 * complicated handling to avoid them and allow use of char's high bit.
34 * Although unset bits are used as padding in the final char, the original
35 * byte sequence could contain trailing bytes with no set bits (null bytes):
36 * padding is indistinguishable from valid information. To overcome this
37 * problem, a char is appended, indicating the number of encoded bytes in the
40 * Some methods in this class are defined over CharBuffers and ByteBuffers, but
41 * these are deprecated in favor of methods that operate directly on byte[] and
42 * char[] arrays. Note that this class calls array() and arrayOffset()
43 * on the CharBuffers and ByteBuffers it uses, so only wrapped arrays may be
44 * used. This class interprets the arrayOffset() and limit() values returned
45 * by its input buffers as beginning and end+1 positions on the wrapped array,
46 * respectively; similarly, on the output buffer, arrayOffset() is the first
47 * position written to, and limit() is set to one past the final output array
50 * WARNING: This means that the deprecated Buffer-based methods
51 * only work correctly with buffers that have an offset of 0. For example, they
52 * will not correctly interpret buffers returned by {@link ByteBuffer#slice}.
54 * @lucene.experimental
56 public final class IndexableBinaryStringTools {
58 private static final CodingCase[] CODING_CASES = {
59 // CodingCase(int initialShift, int finalShift)
60 new CodingCase( 7, 1 ),
61 // CodingCase(int initialShift, int middleShift, int finalShift)
62 new CodingCase(14, 6, 2),
63 new CodingCase(13, 5, 3),
64 new CodingCase(12, 4, 4),
65 new CodingCase(11, 3, 5),
66 new CodingCase(10, 2, 6),
67 new CodingCase( 9, 1, 7),
68 new CodingCase( 8, 0 )
71 // Export only static methods
72 private IndexableBinaryStringTools() {}
75 * Returns the number of chars required to encode the given byte sequence.
77 * @param original The byte sequence to be encoded. Must be backed by an
79 * @return The number of chars required to encode the given byte sequence
80 * @throws IllegalArgumentException If the given ByteBuffer is not backed by
82 * @deprecated Use {@link #getEncodedLength(byte[], int, int)} instead. This
83 * method will be removed in Lucene 4.0
86 public static int getEncodedLength(ByteBuffer original)
87 throws IllegalArgumentException {
88 if (original.hasArray()) {
89 return getEncodedLength(original.array(), original.arrayOffset(),
90 original.limit() - original.arrayOffset());
92 throw new IllegalArgumentException("original argument must have a backing array");
97 * Returns the number of chars required to encode the given bytes.
99 * @param inputArray byte sequence to be encoded
100 * @param inputOffset initial offset into inputArray
101 * @param inputLength number of bytes in inputArray
102 * @return The number of chars required to encode the number of bytes.
104 public static int getEncodedLength(byte[] inputArray, int inputOffset,
106 // Use long for intermediaries to protect against overflow
107 return (int)((8L * inputLength + 14L) / 15L) + 1;
112 * Returns the number of bytes required to decode the given char sequence.
114 * @param encoded The char sequence to be decoded. Must be backed by an array.
115 * @return The number of bytes required to decode the given char sequence
116 * @throws IllegalArgumentException If the given CharBuffer is not backed by
118 * @deprecated Use {@link #getDecodedLength(char[], int, int)} instead. This
119 * method will be removed in Lucene 4.0
122 public static int getDecodedLength(CharBuffer encoded)
123 throws IllegalArgumentException {
124 if (encoded.hasArray()) {
125 return getDecodedLength(encoded.array(), encoded.arrayOffset(),
126 encoded.limit() - encoded.arrayOffset());
128 throw new IllegalArgumentException("encoded argument must have a backing array");
133 * Returns the number of bytes required to decode the given char sequence.
135 * @param encoded char sequence to be decoded
136 * @param offset initial offset
137 * @param length number of characters
138 * @return The number of bytes required to decode the given char sequence
140 public static int getDecodedLength(char[] encoded, int offset, int length) {
141 final int numChars = length - 1;
145 // Use long for intermediaries to protect against overflow
146 final long numFullBytesInFinalChar = encoded[offset + length - 1];
147 final long numEncodedChars = numChars - 1;
148 return (int)((numEncodedChars * 15L + 7L) / 8L + numFullBytesInFinalChar);
153 * Encodes the input byte sequence into the output char sequence. Before
154 * calling this method, ensure that the output CharBuffer has sufficient
155 * capacity by calling {@link #getEncodedLength(java.nio.ByteBuffer)}.
157 * @param input The byte sequence to encode
158 * @param output Where the char sequence encoding result will go. The limit is
159 * set to one past the position of the final char.
160 * @throws IllegalArgumentException If either the input or the output buffer
161 * is not backed by an array
162 * @deprecated Use {@link #encode(byte[], int, int, char[], int, int)}
163 * instead. This method will be removed in Lucene 4.0
166 public static void encode(ByteBuffer input, CharBuffer output) {
167 if (input.hasArray() && output.hasArray()) {
168 final int inputOffset = input.arrayOffset();
169 final int inputLength = input.limit() - inputOffset;
170 final int outputOffset = output.arrayOffset();
171 final int outputLength = getEncodedLength(input.array(), inputOffset,
173 output.limit(outputLength + outputOffset);
175 encode(input.array(), inputOffset, inputLength, output.array(),
176 outputOffset, outputLength);
178 throw new IllegalArgumentException("Arguments must have backing arrays");
183 * Encodes the input byte sequence into the output char sequence. Before
184 * calling this method, ensure that the output array has sufficient
185 * capacity by calling {@link #getEncodedLength(byte[], int, int)}.
187 * @param inputArray byte sequence to be encoded
188 * @param inputOffset initial offset into inputArray
189 * @param inputLength number of bytes in inputArray
190 * @param outputArray char sequence to store encoded result
191 * @param outputOffset initial offset into outputArray
192 * @param outputLength length of output, must be getEncodedLength
194 public static void encode(byte[] inputArray, int inputOffset,
195 int inputLength, char[] outputArray, int outputOffset, int outputLength) {
196 assert (outputLength == getEncodedLength(inputArray, inputOffset,
198 if (inputLength > 0) {
199 int inputByteNum = inputOffset;
201 int outputCharNum = outputOffset;
202 CodingCase codingCase;
203 for (; inputByteNum + CODING_CASES[caseNum].numBytes <= inputLength; ++outputCharNum) {
204 codingCase = CODING_CASES[caseNum];
205 if (2 == codingCase.numBytes) {
206 outputArray[outputCharNum] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
207 + (((inputArray[inputByteNum + 1] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF);
208 } else { // numBytes is 3
209 outputArray[outputCharNum] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
210 + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)
211 + (((inputArray[inputByteNum + 2] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF);
213 inputByteNum += codingCase.advanceBytes;
214 if (++caseNum == CODING_CASES.length) {
218 // Produce final char (if any) and trailing count chars.
219 codingCase = CODING_CASES[caseNum];
221 if (inputByteNum + 1 < inputLength) { // codingCase.numBytes must be 3
222 outputArray[outputCharNum++] = (char) ((((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)) & (short) 0x7FFF);
223 // Add trailing char containing the number of full bytes in final char
224 outputArray[outputCharNum++] = (char) 1;
225 } else if (inputByteNum < inputLength) {
226 outputArray[outputCharNum++] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) & (short) 0x7FFF);
227 // Add trailing char containing the number of full bytes in final char
228 outputArray[outputCharNum++] = caseNum == 0 ? (char) 1 : (char) 0;
229 } else { // No left over bits - last char is completely filled.
230 // Add trailing char containing the number of full bytes in final char
231 outputArray[outputCharNum++] = (char) 1;
237 * Decodes the input char sequence into the output byte sequence. Before
238 * calling this method, ensure that the output ByteBuffer has sufficient
239 * capacity by calling {@link #getDecodedLength(java.nio.CharBuffer)}.
241 * @param input The char sequence to decode
242 * @param output Where the byte sequence decoding result will go. The limit is
243 * set to one past the position of the final char.
244 * @throws IllegalArgumentException If either the input or the output buffer
245 * is not backed by an array
246 * @deprecated Use {@link #decode(char[], int, int, byte[], int, int)}
247 * instead. This method will be removed in Lucene 4.0
250 public static void decode(CharBuffer input, ByteBuffer output) {
251 if (input.hasArray() && output.hasArray()) {
252 final int inputOffset = input.arrayOffset();
253 final int inputLength = input.limit() - inputOffset;
254 final int outputOffset = output.arrayOffset();
255 final int outputLength = getDecodedLength(input.array(), inputOffset,
257 output.limit(outputLength + outputOffset);
259 decode(input.array(), inputOffset, inputLength, output.array(),
260 outputOffset, outputLength);
262 throw new IllegalArgumentException("Arguments must have backing arrays");
267 * Decodes the input char sequence into the output byte sequence. Before
268 * calling this method, ensure that the output array has sufficient capacity
269 * by calling {@link #getDecodedLength(char[], int, int)}.
271 * @param inputArray char sequence to be decoded
272 * @param inputOffset initial offset into inputArray
273 * @param inputLength number of chars in inputArray
274 * @param outputArray byte sequence to store encoded result
275 * @param outputOffset initial offset into outputArray
276 * @param outputLength length of output, must be
277 * getDecodedLength(inputArray, inputOffset, inputLength)
279 public static void decode(char[] inputArray, int inputOffset,
280 int inputLength, byte[] outputArray, int outputOffset, int outputLength) {
281 assert (outputLength == getDecodedLength(inputArray, inputOffset,
283 final int numInputChars = inputLength - 1;
284 final int numOutputBytes = outputLength;
286 if (numOutputBytes > 0) {
288 int outputByteNum = outputOffset;
289 int inputCharNum = inputOffset;
291 CodingCase codingCase;
292 for (; inputCharNum < numInputChars - 1; ++inputCharNum) {
293 codingCase = CODING_CASES[caseNum];
294 inputChar = (short) inputArray[inputCharNum];
295 if (2 == codingCase.numBytes) {
297 outputArray[outputByteNum] = (byte) (inputChar >>> codingCase.initialShift);
299 outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
301 outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
302 } else { // numBytes is 3
303 outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
304 outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift);
305 outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
307 outputByteNum += codingCase.advanceBytes;
308 if (++caseNum == CODING_CASES.length) {
313 inputChar = (short) inputArray[inputCharNum];
314 codingCase = CODING_CASES[caseNum];
316 outputArray[outputByteNum] = 0;
318 outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
319 final int bytesLeft = numOutputBytes - outputByteNum;
321 if (2 == codingCase.numBytes) {
322 outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) >>> codingCase.finalShift);
323 } else { // numBytes is 3
324 outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift);
326 outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
334 * Decodes the given char sequence, which must have been encoded by
335 * {@link #encode(java.nio.ByteBuffer)} or
336 * {@link #encode(java.nio.ByteBuffer, java.nio.CharBuffer)}.
338 * @param input The char sequence to decode
339 * @return A byte sequence containing the decoding result. The limit is set to
340 * one past the position of the final char.
341 * @throws IllegalArgumentException If the input buffer is not backed by an
343 * @deprecated Use {@link #decode(char[], int, int, byte[], int, int)}
344 * instead. This method will be removed in Lucene 4.0
347 public static ByteBuffer decode(CharBuffer input) {
348 byte[] outputArray = new byte[getDecodedLength(input)];
349 ByteBuffer output = ByteBuffer.wrap(outputArray);
350 decode(input, output);
355 * Encodes the input byte sequence.
357 * @param input The byte sequence to encode
358 * @return A char sequence containing the encoding result. The limit is set to
359 * one past the position of the final char.
360 * @throws IllegalArgumentException If the input buffer is not backed by an
362 * @deprecated Use {@link #encode(byte[], int, int, char[], int, int)}
363 * instead. This method will be removed in Lucene 4.0
366 public static CharBuffer encode(ByteBuffer input) {
367 char[] outputArray = new char[getEncodedLength(input)];
368 CharBuffer output = CharBuffer.wrap(outputArray);
369 encode(input, output);
373 static class CodingCase {
374 int numBytes, initialShift, middleShift, finalShift, advanceBytes = 2;
375 short middleMask, finalMask;
377 CodingCase(int initialShift, int middleShift, int finalShift) {
379 this.initialShift = initialShift;
380 this.middleShift = middleShift;
381 this.finalShift = finalShift;
382 this.finalMask = (short)((short)0xFF >>> finalShift);
383 this.middleMask = (short)((short)0xFF << middleShift);
386 CodingCase(int initialShift, int finalShift) {
388 this.initialShift = initialShift;
389 this.finalShift = finalShift;
390 this.finalMask = (short)((short)0xFF >>> finalShift);
391 if (finalShift != 0) {