lucene-java-3.4.0/lucene/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java

   1 package org.apache.lucene.util;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.nio.CharBuffer;
  21 import java.nio.ByteBuffer;
  22
  23 /**
  24  * Provides support for converting byte sequences to Strings and back again.
  25  * The resulting Strings preserve the original byte sequences' sort order.
  26  * <p/>
  27  * The Strings are constructed using a Base 8000h encoding of the original
  28  * binary data - each char of an encoded String represents a 15-bit chunk
  29  * from the byte sequence.  Base 8000h was chosen because it allows for all
  30  * lower 15 bits of char to be used without restriction; the surrogate range
  31  * [U+D8000-U+DFFF] does not represent valid chars, and would require
  32  * complicated handling to avoid them and allow use of char's high bit.
  33  * <p/>
  34  * Although unset bits are used as padding in the final char, the original
  35  * byte sequence could contain trailing bytes with no set bits (null bytes):
  36  * padding is indistinguishable from valid information.  To overcome this
  37  * problem, a char is appended, indicating the number of encoded bytes in the
  38  * final content char.
  39  * <p/>
  40  * Some methods in this class are defined over CharBuffers and ByteBuffers, but
  41  * these are deprecated in favor of methods that operate directly on byte[] and
  42  * char[] arrays.  Note that this class calls array() and arrayOffset()
  43  * on the CharBuffers and ByteBuffers it uses, so only wrapped arrays may be
  44  * used.  This class interprets the arrayOffset() and limit() values returned
  45  * by its input buffers as beginning and end+1 positions on the wrapped array,
  46  * respectively; similarly, on the output buffer, arrayOffset() is the first
  47  * position written to, and limit() is set to one past the final output array
  48  * position.
  49  * <p/>
  50  * WARNING: This means that the deprecated Buffer-based methods
  51  * only work correctly with buffers that have an offset of 0. For example, they
  52  * will not correctly interpret buffers returned by {@link ByteBuffer#slice}.
  53  *
  54  * @lucene.experimental
  55  */
  56 public final class IndexableBinaryStringTools {
  57
  58   private static final CodingCase[] CODING_CASES = {
  59     // CodingCase(int initialShift, int finalShift)
  60     new CodingCase( 7, 1   ),
  61     // CodingCase(int initialShift, int middleShift, int finalShift)
  62     new CodingCase(14, 6, 2),
  63     new CodingCase(13, 5, 3),
  64     new CodingCase(12, 4, 4),
  65     new CodingCase(11, 3, 5),
  66     new CodingCase(10, 2, 6),
  67     new CodingCase( 9, 1, 7),
  68     new CodingCase( 8, 0   )
  69   };
  70
  71   // Export only static methods
  72   private IndexableBinaryStringTools() {}
  73
  74   /**
  75    * Returns the number of chars required to encode the given byte sequence.
  76    *
  77    * @param original The byte sequence to be encoded. Must be backed by an
  78    *        array.
  79    * @return The number of chars required to encode the given byte sequence
  80    * @throws IllegalArgumentException If the given ByteBuffer is not backed by
  81    *         an array
  82    * @deprecated Use {@link #getEncodedLength(byte[], int, int)} instead. This
  83    *             method will be removed in Lucene 4.0
  84    */
  85   @Deprecated
  86   public static int getEncodedLength(ByteBuffer original)
  87     throws IllegalArgumentException {
  88     if (original.hasArray()) {
  89       return getEncodedLength(original.array(), original.arrayOffset(),
  90           original.limit() - original.arrayOffset());
  91     } else {
  92       throw new IllegalArgumentException("original argument must have a backing array");
  93     }
  94   }
  95
  96   /**
  97    * Returns the number of chars required to encode the given bytes.
  98    *
  99    * @param inputArray byte sequence to be encoded
 100    * @param inputOffset initial offset into inputArray
 101    * @param inputLength number of bytes in inputArray
 102    * @return The number of chars required to encode the number of bytes.
 103    */
 104   public static int getEncodedLength(byte[] inputArray, int inputOffset,
 105       int inputLength) {
 106     // Use long for intermediaries to protect against overflow
 107     return (int)((8L * inputLength + 14L) / 15L) + 1;
 108   }
 109
 110
 111   /**
 112    * Returns the number of bytes required to decode the given char sequence.
 113    *
 114    * @param encoded The char sequence to be decoded. Must be backed by an array.
 115    * @return The number of bytes required to decode the given char sequence
 116    * @throws IllegalArgumentException If the given CharBuffer is not backed by
 117    *         an array
 118    * @deprecated Use {@link #getDecodedLength(char[], int, int)} instead. This
 119    *             method will be removed in Lucene 4.0
 120    */
 121   @Deprecated
 122   public static int getDecodedLength(CharBuffer encoded)
 123     throws IllegalArgumentException {
 124     if (encoded.hasArray()) {
 125       return getDecodedLength(encoded.array(), encoded.arrayOffset(),
 126           encoded.limit() - encoded.arrayOffset());
 127     } else {
 128       throw new IllegalArgumentException("encoded argument must have a backing array");
 129     }
 130   }
 131
 132   /**
 133    * Returns the number of bytes required to decode the given char sequence.
 134    *
 135    * @param encoded char sequence to be decoded
 136    * @param offset initial offset
 137    * @param length number of characters
 138    * @return The number of bytes required to decode the given char sequence
 139    */
 140   public static int getDecodedLength(char[] encoded, int offset, int length) {
 141     final int numChars = length - 1;
 142     if (numChars <= 0) {
 143       return 0;
 144     } else {
 145       // Use long for intermediaries to protect against overflow
 146       final long numFullBytesInFinalChar = encoded[offset + length - 1];
 147       final long numEncodedChars = numChars - 1;
 148       return (int)((numEncodedChars * 15L + 7L) / 8L + numFullBytesInFinalChar);
 149     }
 150   }
 151
 152   /**
 153    * Encodes the input byte sequence into the output char sequence. Before
 154    * calling this method, ensure that the output CharBuffer has sufficient
 155    * capacity by calling {@link #getEncodedLength(java.nio.ByteBuffer)}.
 156    *
 157    * @param input The byte sequence to encode
 158    * @param output Where the char sequence encoding result will go. The limit is
 159    *        set to one past the position of the final char.
 160    * @throws IllegalArgumentException If either the input or the output buffer
 161    *         is not backed by an array
 162    * @deprecated Use {@link #encode(byte[], int, int, char[], int, int)}
 163    *             instead. This method will be removed in Lucene 4.0
 164    */
 165   @Deprecated
 166   public static void encode(ByteBuffer input, CharBuffer output) {
 167     if (input.hasArray() && output.hasArray()) {
 168       final int inputOffset = input.arrayOffset();
 169       final int inputLength = input.limit() - inputOffset;
 170       final int outputOffset = output.arrayOffset();
 171       final int outputLength = getEncodedLength(input.array(), inputOffset,
 172           inputLength);
 173       output.limit(outputLength + outputOffset);
 174       output.position(0);
 175       encode(input.array(), inputOffset, inputLength, output.array(),
 176           outputOffset, outputLength);
 177     } else {
 178       throw new IllegalArgumentException("Arguments must have backing arrays");
 179     }
 180   }
 181
 182   /**
 183    * Encodes the input byte sequence into the output char sequence.  Before
 184    * calling this method, ensure that the output array has sufficient
 185    * capacity by calling {@link #getEncodedLength(byte[], int, int)}.
 186    *
 187    * @param inputArray byte sequence to be encoded
 188    * @param inputOffset initial offset into inputArray
 189    * @param inputLength number of bytes in inputArray
 190    * @param outputArray char sequence to store encoded result
 191    * @param outputOffset initial offset into outputArray
 192    * @param outputLength length of output, must be getEncodedLength
 193    */
 194   public static void encode(byte[] inputArray, int inputOffset,
 195       int inputLength, char[] outputArray, int outputOffset, int outputLength) {
 196     assert (outputLength == getEncodedLength(inputArray, inputOffset,
 197         inputLength));
 198     if (inputLength > 0) {
 199       int inputByteNum = inputOffset;
 200       int caseNum = 0;
 201       int outputCharNum = outputOffset;
 202       CodingCase codingCase;
 203       for (; inputByteNum + CODING_CASES[caseNum].numBytes <= inputLength; ++outputCharNum) {
 204         codingCase = CODING_CASES[caseNum];
 205         if (2 == codingCase.numBytes) {
 206           outputArray[outputCharNum] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
 207               + (((inputArray[inputByteNum + 1] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF);
 208         } else { // numBytes is 3
 209           outputArray[outputCharNum] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
 210               + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)
 211               + (((inputArray[inputByteNum + 2] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF);
 212         }
 213         inputByteNum += codingCase.advanceBytes;
 214         if (++caseNum == CODING_CASES.length) {
 215           caseNum = 0;
 216         }
 217       }
 218       // Produce final char (if any) and trailing count chars.
 219       codingCase = CODING_CASES[caseNum];
 220
 221       if (inputByteNum + 1 < inputLength) { // codingCase.numBytes must be 3
 222         outputArray[outputCharNum++] = (char) ((((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)) & (short) 0x7FFF);
 223         // Add trailing char containing the number of full bytes in final char
 224         outputArray[outputCharNum++] = (char) 1;
 225       } else if (inputByteNum < inputLength) {
 226         outputArray[outputCharNum++] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) & (short) 0x7FFF);
 227         // Add trailing char containing the number of full bytes in final char
 228         outputArray[outputCharNum++] = caseNum == 0 ? (char) 1 : (char) 0;
 229       } else { // No left over bits - last char is completely filled.
 230         // Add trailing char containing the number of full bytes in final char
 231         outputArray[outputCharNum++] = (char) 1;
 232       }
 233     }
 234   }
 235
 236   /**
 237    * Decodes the input char sequence into the output byte sequence. Before
 238    * calling this method, ensure that the output ByteBuffer has sufficient
 239    * capacity by calling {@link #getDecodedLength(java.nio.CharBuffer)}.
 240    *
 241    * @param input The char sequence to decode
 242    * @param output Where the byte sequence decoding result will go. The limit is
 243    *        set to one past the position of the final char.
 244    * @throws IllegalArgumentException If either the input or the output buffer
 245    *         is not backed by an array
 246    * @deprecated Use {@link #decode(char[], int, int, byte[], int, int)}
 247    *             instead. This method will be removed in Lucene 4.0
 248    */
 249   @Deprecated
 250   public static void decode(CharBuffer input, ByteBuffer output) {
 251     if (input.hasArray() && output.hasArray()) {
 252       final int inputOffset = input.arrayOffset();
 253       final int inputLength = input.limit() - inputOffset;
 254       final int outputOffset = output.arrayOffset();
 255       final int outputLength = getDecodedLength(input.array(), inputOffset,
 256           inputLength);
 257       output.limit(outputLength + outputOffset);
 258       output.position(0);
 259       decode(input.array(), inputOffset, inputLength, output.array(),
 260           outputOffset, outputLength);
 261     } else {
 262       throw new IllegalArgumentException("Arguments must have backing arrays");
 263     }
 264   }
 265
 266   /**
 267    * Decodes the input char sequence into the output byte sequence. Before
 268    * calling this method, ensure that the output array has sufficient capacity
 269    * by calling {@link #getDecodedLength(char[], int, int)}.
 270    *
 271    * @param inputArray char sequence to be decoded
 272    * @param inputOffset initial offset into inputArray
 273    * @param inputLength number of chars in inputArray
 274    * @param outputArray byte sequence to store encoded result
 275    * @param outputOffset initial offset into outputArray
 276    * @param outputLength length of output, must be
 277    *        getDecodedLength(inputArray, inputOffset, inputLength)
 278    */
 279   public static void decode(char[] inputArray, int inputOffset,
 280       int inputLength, byte[] outputArray, int outputOffset, int outputLength) {
 281     assert (outputLength == getDecodedLength(inputArray, inputOffset,
 282         inputLength));
 283     final int numInputChars = inputLength - 1;
 284     final int numOutputBytes = outputLength;
 285
 286     if (numOutputBytes > 0) {
 287       int caseNum = 0;
 288       int outputByteNum = outputOffset;
 289       int inputCharNum = inputOffset;
 290       short inputChar;
 291       CodingCase codingCase;
 292       for (; inputCharNum < numInputChars - 1; ++inputCharNum) {
 293         codingCase = CODING_CASES[caseNum];
 294         inputChar = (short) inputArray[inputCharNum];
 295         if (2 == codingCase.numBytes) {
 296           if (0 == caseNum) {
 297             outputArray[outputByteNum] = (byte) (inputChar >>> codingCase.initialShift);
 298           } else {
 299             outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
 300           }
 301           outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
 302         } else { // numBytes is 3
 303           outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
 304           outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift);
 305           outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
 306         }
 307         outputByteNum += codingCase.advanceBytes;
 308         if (++caseNum == CODING_CASES.length) {
 309           caseNum = 0;
 310         }
 311       }
 312       // Handle final char
 313       inputChar = (short) inputArray[inputCharNum];
 314       codingCase = CODING_CASES[caseNum];
 315       if (0 == caseNum) {
 316         outputArray[outputByteNum] = 0;
 317       }
 318       outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
 319       final int bytesLeft = numOutputBytes - outputByteNum;
 320       if (bytesLeft > 1) {
 321         if (2 == codingCase.numBytes) {
 322           outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) >>> codingCase.finalShift);
 323         } else { // numBytes is 3
 324           outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift);
 325           if (bytesLeft > 2) {
 326             outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
 327           }
 328         }
 329       }
 330     }
 331   }
 332
 333   /**
 334    * Decodes the given char sequence, which must have been encoded by
 335    * {@link #encode(java.nio.ByteBuffer)} or
 336    * {@link #encode(java.nio.ByteBuffer, java.nio.CharBuffer)}.
 337    *
 338    * @param input The char sequence to decode
 339    * @return A byte sequence containing the decoding result. The limit is set to
 340    *         one past the position of the final char.
 341    * @throws IllegalArgumentException If the input buffer is not backed by an
 342    *         array
 343    * @deprecated Use {@link #decode(char[], int, int, byte[], int, int)}
 344    *             instead. This method will be removed in Lucene 4.0
 345    */
 346   @Deprecated
 347   public static ByteBuffer decode(CharBuffer input) {
 348     byte[] outputArray = new byte[getDecodedLength(input)];
 349     ByteBuffer output = ByteBuffer.wrap(outputArray);
 350     decode(input, output);
 351     return output;
 352   }
 353
 354   /**
 355    * Encodes the input byte sequence.
 356    *
 357    * @param input The byte sequence to encode
 358    * @return A char sequence containing the encoding result. The limit is set to
 359    *         one past the position of the final char.
 360    * @throws IllegalArgumentException If the input buffer is not backed by an
 361    *         array
 362    * @deprecated Use {@link #encode(byte[], int, int, char[], int, int)}
 363    *             instead. This method will be removed in Lucene 4.0
 364    */
 365   @Deprecated
 366   public static CharBuffer encode(ByteBuffer input) {
 367     char[] outputArray = new char[getEncodedLength(input)];
 368     CharBuffer output = CharBuffer.wrap(outputArray);
 369     encode(input, output);
 370     return output;
 371   }
 372
 373   static class CodingCase {
 374     int numBytes, initialShift, middleShift, finalShift, advanceBytes = 2;
 375     short middleMask, finalMask;
 376
 377     CodingCase(int initialShift, int middleShift, int finalShift) {
 378       this.numBytes = 3;
 379       this.initialShift = initialShift;
 380       this.middleShift = middleShift;
 381       this.finalShift = finalShift;
 382       this.finalMask = (short)((short)0xFF >>> finalShift);
 383       this.middleMask = (short)((short)0xFF << middleShift);
 384     }
 385
 386     CodingCase(int initialShift, int finalShift) {
 387       this.numBytes = 2;
 388       this.initialShift = initialShift;
 389       this.finalShift = finalShift;
 390       this.finalMask = (short)((short)0xFF >>> finalShift);
 391       if (finalShift != 0) {
 392         advanceBytes = 1;
 393       }
 394     }
 395   }
 396 }