--- /dev/null
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.nio.CharBuffer;
+import java.nio.ByteBuffer;
+
+/**
+ * Provides support for converting byte sequences to Strings and back again.
+ * The resulting Strings preserve the original byte sequences' sort order.
+ * <p>
+ * The Strings are constructed using a Base 8000h encoding of the original
+ * binary data - each char of an encoded String represents a 15-bit chunk
+ * from the byte sequence. Base 8000h was chosen because it allows for all
+ * lower 15 bits of char to be used without restriction; the surrogate range
+ * [U+D800-U+DFFF] does not represent valid chars, and would require
+ * complicated handling to avoid them and allow use of char's high bit.
+ * <p>
+ * Although unset bits are used as padding in the final char, the original
+ * byte sequence could contain trailing bytes with no set bits (null bytes):
+ * padding is indistinguishable from valid information. To overcome this
+ * problem, a char is appended, indicating the number of encoded bytes in the
+ * final content char.
+ * <p>
+ * Some methods in this class are defined over CharBuffers and ByteBuffers, but
+ * these are deprecated in favor of methods that operate directly on byte[] and
+ * char[] arrays. Note that this class calls array() and arrayOffset()
+ * on the CharBuffers and ByteBuffers it uses, so only wrapped arrays may be
+ * used. The content of an input buffer is taken to be the elements at buffer
+ * indexes 0 (inclusive) through limit() (exclusive), that is, backing array
+ * positions arrayOffset() through arrayOffset() + limit() - 1; position() is
+ * ignored. On an output buffer, writing starts at backing array position
+ * arrayOffset(), position() is reset to 0 and limit() is set to the number of
+ * elements written.
+ *
+ * @lucene.experimental
+ */
+public final class IndexableBinaryStringTools {
+
+  /** Selects the low 15 bits of a char - the only bits an encoded char uses. */
+  private static final int CHAR_MASK = 0x7FFF;
+
+  /**
+   * The eight ways a 15-bit char can straddle byte boundaries. Eight chars
+   * carry exactly fifteen bytes, after which the cycle restarts.
+   */
+  private static final CodingCase[] CODING_CASES = {
+    // CodingCase(int initialShift, int finalShift)
+    new CodingCase( 7, 1   ),
+    // CodingCase(int initialShift, int middleShift, int finalShift)
+    new CodingCase(14, 6, 2),
+    new CodingCase(13, 5, 3),
+    new CodingCase(12, 4, 4),
+    new CodingCase(11, 3, 5),
+    new CodingCase(10, 2, 6),
+    new CodingCase( 9, 1, 7),
+    new CodingCase( 8, 0   )
+  };
+
+  // Export only static methods
+  private IndexableBinaryStringTools() {}
+
+  /**
+   * Returns the number of chars required to encode the given byte sequence.
+   *
+   * @param original The byte sequence to be encoded. Must be backed by an
+   *        array.
+   * @return The number of chars required to encode the given byte sequence
+   * @throws IllegalArgumentException If the given ByteBuffer is not backed by
+   *         an array
+   * @deprecated Use {@link #getEncodedLength(byte[], int, int)} instead. This
+   *             method will be removed in Lucene 4.0
+   */
+  @Deprecated
+  public static int getEncodedLength(ByteBuffer original)
+    throws IllegalArgumentException {
+    if (original.hasArray()) {
+      // limit() is relative to the buffer, so it already is the element count.
+      return getEncodedLength(original.array(), original.arrayOffset(),
+          original.limit());
+    } else {
+      throw new IllegalArgumentException("original argument must have a backing array");
+    }
+  }
+
+  /**
+   * Returns the number of chars required to encode the given bytes.
+   *
+   * @param inputArray byte sequence to be encoded
+   * @param inputOffset initial offset into inputArray
+   * @param inputLength number of bytes in inputArray
+   * @return The number of chars required to encode the number of bytes.
+   */
+  public static int getEncodedLength(byte[] inputArray, int inputOffset,
+      int inputLength) {
+    // ceil(bits / 15) content chars plus one trailing count char.
+    // Use long for intermediaries to protect against overflow
+    return (int)((8L * inputLength + 14L) / 15L) + 1;
+  }
+
+  /**
+   * Returns the number of bytes required to decode the given char sequence.
+   *
+   * @param encoded The char sequence to be decoded. Must be backed by an array.
+   * @return The number of bytes required to decode the given char sequence
+   * @throws IllegalArgumentException If the given CharBuffer is not backed by
+   *         an array
+   * @deprecated Use {@link #getDecodedLength(char[], int, int)} instead. This
+   *             method will be removed in Lucene 4.0
+   */
+  @Deprecated
+  public static int getDecodedLength(CharBuffer encoded)
+    throws IllegalArgumentException {
+    if (encoded.hasArray()) {
+      // limit() is relative to the buffer, so it already is the element count.
+      return getDecodedLength(encoded.array(), encoded.arrayOffset(),
+          encoded.limit());
+    } else {
+      throw new IllegalArgumentException("encoded argument must have a backing array");
+    }
+  }
+
+  /**
+   * Returns the number of bytes required to decode the given char sequence.
+   *
+   * @param encoded char sequence to be decoded
+   * @param offset initial offset
+   * @param length number of characters
+   * @return The number of bytes required to decode the given char sequence
+   */
+  public static int getDecodedLength(char[] encoded, int offset, int length) {
+    // The last char is the trailing count char, not content.
+    final int numChars = length - 1;
+    if (numChars <= 0) {
+      return 0;
+    } else {
+      // Use long for intermediaries to protect against overflow
+      final long numFullBytesInFinalChar = encoded[offset + length - 1];
+      final long numEncodedChars = numChars - 1;
+      return (int)((numEncodedChars * 15L + 7L) / 8L + numFullBytesInFinalChar);
+    }
+  }
+
+  /**
+   * Encodes the input byte sequence into the output char sequence. Before
+   * calling this method, ensure that the output CharBuffer has sufficient
+   * capacity by calling {@link #getEncodedLength(java.nio.ByteBuffer)}.
+   *
+   * @param input The byte sequence to encode
+   * @param output Where the char sequence encoding result will go. The
+   *        position is set to 0 and the limit is set to one past the position
+   *        of the final char.
+   * @throws IllegalArgumentException If either the input or the output buffer
+   *         is not backed by an array
+   * @deprecated Use {@link #encode(byte[], int, int, char[], int, int)}
+   *             instead. This method will be removed in Lucene 4.0
+   */
+  @Deprecated
+  public static void encode(ByteBuffer input, CharBuffer output) {
+    if (input.hasArray() && output.hasArray()) {
+      final int inputOffset = input.arrayOffset();
+      // limit() is relative to the buffer, so it already is the element count.
+      final int inputLength = input.limit();
+      final int outputOffset = output.arrayOffset();
+      final int outputLength = getEncodedLength(input.array(), inputOffset,
+          inputLength);
+      output.limit(outputLength);
+      output.position(0);
+      encode(input.array(), inputOffset, inputLength, output.array(),
+          outputOffset, outputLength);
+    } else {
+      throw new IllegalArgumentException("Arguments must have backing arrays");
+    }
+  }
+
+  /**
+   * Encodes the input byte sequence into the output char sequence. Before
+   * calling this method, ensure that the output array has sufficient
+   * capacity by calling {@link #getEncodedLength(byte[], int, int)}.
+   * <p>
+   * Exactly outputLength chars are written, starting at outputOffset. An
+   * empty input is encoded as the single char U+0000.
+   *
+   * @param inputArray byte sequence to be encoded
+   * @param inputOffset initial offset into inputArray
+   * @param inputLength number of bytes in inputArray
+   * @param outputArray char sequence to store encoded result
+   * @param outputOffset initial offset into outputArray
+   * @param outputLength length of output, must be getEncodedLength
+   */
+  public static void encode(byte[] inputArray, int inputOffset,
+      int inputLength, char[] outputArray, int outputOffset, int outputLength) {
+    assert (outputLength == getEncodedLength(inputArray, inputOffset,
+        inputLength));
+    if (inputLength > 0) {
+      // All index comparisons are against the absolute end position so that
+      // a non-zero inputOffset is honored.
+      final int inputEnd = inputOffset + inputLength;
+      int inputByteNum = inputOffset;
+      int caseNum = 0;
+      int outputCharNum = outputOffset;
+      CodingCase codingCase;
+      // Emit every char whose 15 bits are fully covered by remaining input.
+      for (; inputByteNum + CODING_CASES[caseNum].numBytes <= inputEnd; ++outputCharNum) {
+        codingCase = CODING_CASES[caseNum];
+        int bits = (inputArray[inputByteNum] & 0xFF) << codingCase.initialShift;
+        if (2 == codingCase.numBytes) {
+          bits |= ((inputArray[inputByteNum + 1] & 0xFF) >>> codingCase.finalShift)
+              & codingCase.finalMask;
+        } else { // numBytes is 3
+          bits |= (inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift;
+          bits |= ((inputArray[inputByteNum + 2] & 0xFF) >>> codingCase.finalShift)
+              & codingCase.finalMask;
+        }
+        outputArray[outputCharNum] = (char) (bits & CHAR_MASK);
+        inputByteNum += codingCase.advanceBytes;
+        if (++caseNum == CODING_CASES.length) {
+          caseNum = 0;
+        }
+      }
+      // Produce final char (if any) and trailing count chars.
+      codingCase = CODING_CASES[caseNum];
+
+      if (inputByteNum + 1 < inputEnd) { // codingCase.numBytes must be 3
+        final int bits = ((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
+            | ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift);
+        outputArray[outputCharNum++] = (char) (bits & CHAR_MASK);
+        // Add trailing char containing the number of full bytes in final char
+        outputArray[outputCharNum] = (char) 1;
+      } else if (inputByteNum < inputEnd) {
+        final int bits = (inputArray[inputByteNum] & 0xFF) << codingCase.initialShift;
+        outputArray[outputCharNum++] = (char) (bits & CHAR_MASK);
+        // Add trailing char containing the number of full bytes in final char
+        outputArray[outputCharNum] = caseNum == 0 ? (char) 1 : (char) 0;
+      } else { // No left over bits - last char is completely filled.
+        // Add trailing char containing the number of full bytes in final char
+        outputArray[outputCharNum] = (char) 1;
+      }
+    } else if (outputLength > 0) {
+      // Empty input still occupies one (count) char; write it explicitly so a
+      // reused output array cannot leak a stale value into the encoding.
+      outputArray[outputOffset] = (char) 0;
+    }
+  }
+
+  /**
+   * Decodes the input char sequence into the output byte sequence. Before
+   * calling this method, ensure that the output ByteBuffer has sufficient
+   * capacity by calling {@link #getDecodedLength(java.nio.CharBuffer)}.
+   *
+   * @param input The char sequence to decode
+   * @param output Where the byte sequence decoding result will go. The
+   *        position is set to 0 and the limit is set to one past the position
+   *        of the final byte.
+   * @throws IllegalArgumentException If either the input or the output buffer
+   *         is not backed by an array
+   * @deprecated Use {@link #decode(char[], int, int, byte[], int, int)}
+   *             instead. This method will be removed in Lucene 4.0
+   */
+  @Deprecated
+  public static void decode(CharBuffer input, ByteBuffer output) {
+    if (input.hasArray() && output.hasArray()) {
+      final int inputOffset = input.arrayOffset();
+      // limit() is relative to the buffer, so it already is the element count.
+      final int inputLength = input.limit();
+      final int outputOffset = output.arrayOffset();
+      final int outputLength = getDecodedLength(input.array(), inputOffset,
+          inputLength);
+      output.limit(outputLength);
+      output.position(0);
+      decode(input.array(), inputOffset, inputLength, output.array(),
+          outputOffset, outputLength);
+    } else {
+      throw new IllegalArgumentException("Arguments must have backing arrays");
+    }
+  }
+
+  /**
+   * Decodes the input char sequence into the output byte sequence. Before
+   * calling this method, ensure that the output array has sufficient capacity
+   * by calling {@link #getDecodedLength(char[], int, int)}.
+   *
+   * @param inputArray char sequence to be decoded
+   * @param inputOffset initial offset into inputArray
+   * @param inputLength number of chars in inputArray
+   * @param outputArray byte sequence to store encoded result
+   * @param outputOffset initial offset into outputArray
+   * @param outputLength length of output, must be
+   *        getDecodedLength(inputArray, inputOffset, inputLength)
+   */
+  public static void decode(char[] inputArray, int inputOffset,
+      int inputLength, byte[] outputArray, int outputOffset, int outputLength) {
+    assert (outputLength == getDecodedLength(inputArray, inputOffset,
+        inputLength));
+    final int numOutputBytes = outputLength;
+
+    if (numOutputBytes > 0) {
+      // Absolute positions, so that non-zero offsets are honored. The last
+      // input char is the count char; the one before it is the final content
+      // char, which needs special handling because it may be partially filled.
+      final int finalContentChar = inputOffset + inputLength - 2;
+      final int outputEnd = outputOffset + numOutputBytes;
+      int caseNum = 0;
+      int outputByteNum = outputOffset;
+      int inputCharNum = inputOffset;
+      short inputChar;
+      CodingCase codingCase;
+      for (; inputCharNum < finalContentChar; ++inputCharNum) {
+        codingCase = CODING_CASES[caseNum];
+        inputChar = (short) inputArray[inputCharNum];
+        if (2 == codingCase.numBytes) {
+          if (0 == caseNum) {
+            // Case 0 starts on a byte boundary: overwrite, don't accumulate.
+            outputArray[outputByteNum] = (byte) (inputChar >>> codingCase.initialShift);
+          } else {
+            // Complete the byte whose high bits the previous char supplied.
+            outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
+          }
+          outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
+        } else { // numBytes is 3
+          outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
+          outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift);
+          outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
+        }
+        outputByteNum += codingCase.advanceBytes;
+        if (++caseNum == CODING_CASES.length) {
+          caseNum = 0;
+        }
+      }
+      // Handle final char
+      inputChar = (short) inputArray[inputCharNum];
+      codingCase = CODING_CASES[caseNum];
+      if (0 == caseNum) {
+        outputArray[outputByteNum] = 0;
+      }
+      outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
+      // Only touch the bytes that actually belong to the decoded output.
+      final int bytesLeft = outputEnd - outputByteNum;
+      if (bytesLeft > 1) {
+        if (2 == codingCase.numBytes) {
+          outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) >>> codingCase.finalShift);
+        } else { // numBytes is 3
+          outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift);
+          if (bytesLeft > 2) {
+            outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Decodes the given char sequence, which must have been encoded by
+   * {@link #encode(java.nio.ByteBuffer)} or
+   * {@link #encode(java.nio.ByteBuffer, java.nio.CharBuffer)}.
+   *
+   * @param input The char sequence to decode
+   * @return A byte sequence containing the decoding result. The limit is set to
+   *         one past the position of the final byte.
+   * @throws IllegalArgumentException If the input buffer is not backed by an
+   *         array
+   * @deprecated Use {@link #decode(char[], int, int, byte[], int, int)}
+   *             instead. This method will be removed in Lucene 4.0
+   */
+  @Deprecated
+  public static ByteBuffer decode(CharBuffer input) {
+    byte[] outputArray = new byte[getDecodedLength(input)];
+    ByteBuffer output = ByteBuffer.wrap(outputArray);
+    decode(input, output);
+    return output;
+  }
+
+  /**
+   * Encodes the input byte sequence.
+   *
+   * @param input The byte sequence to encode
+   * @return A char sequence containing the encoding result. The limit is set to
+   *         one past the position of the final char.
+   * @throws IllegalArgumentException If the input buffer is not backed by an
+   *         array
+   * @deprecated Use {@link #encode(byte[], int, int, char[], int, int)}
+   *             instead. This method will be removed in Lucene 4.0
+   */
+  @Deprecated
+  public static CharBuffer encode(ByteBuffer input) {
+    char[] outputArray = new char[getEncodedLength(input)];
+    CharBuffer output = CharBuffer.wrap(outputArray);
+    encode(input, output);
+    return output;
+  }
+
+  /**
+   * Describes how one 15-bit char maps onto the two or three consecutive
+   * bytes it overlaps: the shifts and masks for each byte's contribution, and
+   * how many bytes are completely consumed once the char has been processed.
+   */
+  static class CodingCase {
+    final int numBytes, initialShift, middleShift, finalShift, advanceBytes;
+    final short middleMask, finalMask;
+
+    /** A char that overlaps three bytes (tail of one, a whole one, head of a third). */
+    CodingCase(int initialShift, int middleShift, int finalShift) {
+      this.numBytes = 3;
+      this.initialShift = initialShift;
+      this.middleShift = middleShift;
+      this.finalShift = finalShift;
+      this.advanceBytes = 2;
+      this.finalMask = (short)((short)0xFF >>> finalShift);
+      this.middleMask = (short)((short)0xFF << middleShift);
+    }
+
+    /** A char that overlaps two bytes; there is no middle byte. */
+    CodingCase(int initialShift, int finalShift) {
+      this.numBytes = 2;
+      this.initialShift = initialShift;
+      this.middleShift = 0;
+      this.finalShift = finalShift;
+      // A zero final shift means the second byte is used up entirely.
+      this.advanceBytes = (finalShift != 0) ? 1 : 2;
+      this.finalMask = (short)((short)0xFF >>> finalShift);
+      this.middleMask = 0;
+    }
+  }
+}