lucene-java-3.4.0/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java

   1 package org.apache.lucene.util;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20
  21 /*
  22  * Some of this code came from the excellent Unicode
  23  * conversion examples from:
  24  *
  25  *   http://www.unicode.org/Public/PROGRAMS/CVTUTF
  26  *
  27  * Full Copyright for that code follows:
  28 */
  29
  30 /*
  31  * Copyright 2001-2004 Unicode, Inc.
  32  *
  33  * Disclaimer
  34  *
  35  * This source code is provided as is by Unicode, Inc. No claims are
  36  * made as to fitness for any particular purpose. No warranties of any
  37  * kind are expressed or implied. The recipient agrees to determine
  38  * applicability of information provided. If this file has been
  39  * purchased on magnetic or optical media from Unicode, Inc., the
  40  * sole remedy for any claim will be exchange of defective media
  41  * within 90 days of receipt.
  42  *
  43  * Limitations on Rights to Redistribute This Code
  44  *
  45  * Unicode, Inc. hereby grants the right to freely use the information
  46  * supplied in this file in the creation of products supporting the
  47  * Unicode Standard, and to make copies of this file in any form
  48  * for internal or external distribution as long as this notice
  49  * remains attached.
  50  */
  51
  52 /*
  53  * Additional code came from the IBM ICU library.
  54  *
  55  *  http://www.icu-project.org
  56  *
  57  * Full Copyright for that code follows.
  58  */
  59
  60 /*
  61  * Copyright (C) 1999-2010, International Business Machines
  62  * Corporation and others.  All Rights Reserved.
  63  *
  64  * Permission is hereby granted, free of charge, to any person obtaining a copy
  65  * of this software and associated documentation files (the "Software"), to deal
  66  * in the Software without restriction, including without limitation the rights
  67  * to use, copy, modify, merge, publish, distribute, and/or sell copies of the
  68  * Software, and to permit persons to whom the Software is furnished to do so,
  69  * provided that the above copyright notice(s) and this permission notice appear
  70  * in all copies of the Software and that both the above copyright notice(s) and
  71  * this permission notice appear in supporting documentation.
  72  *
  73  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  74  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  75  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
  76  * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
  77  * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
  78  * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
  79  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  80  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  81  *
  82  * Except as contained in this notice, the name of a copyright holder shall not
  83  * be used in advertising or otherwise to promote the sale, use or other
  84  * dealings in this Software without prior written authorization of the
  85  * copyright holder.
  86  */
  87
  88 /**
  89  * Class to encode java's UTF16 char[] into UTF8 byte[]
  90  * without always allocating a new byte[] as
  91  * String.getBytes("UTF-8") does.
  92  *
  93  * @lucene.internal
  94  */
  95
  96 public final class UnicodeUtil {
  97
   98   private UnicodeUtil() {} // static utility holder; never instantiated
   99
        /** First code unit of the UTF-16 high (lead) surrogate range. */
  100   public static final int UNI_SUR_HIGH_START = 0xD800;
        /** Last code unit of the UTF-16 high (lead) surrogate range. */
  101   public static final int UNI_SUR_HIGH_END = 0xDBFF;
        /** First code unit of the UTF-16 low (trail) surrogate range. */
  102   public static final int UNI_SUR_LOW_START = 0xDC00;
        /** Last code unit of the UTF-16 low (trail) surrogate range. */
  103   public static final int UNI_SUR_LOW_END = 0xDFFF;
        /** U+FFFD REPLACEMENT CHARACTER; its UTF-8 form is the byte sequence EF BF BD. */
  104   public static final int UNI_REPLACEMENT_CHAR = 0xFFFD;
  105
        /** Largest code point that fits in a single UTF-16 code unit (U+FFFF). */
  106   private static final long UNI_MAX_BMP = 0x0000FFFF;
  107
        /** Value subtracted from a supplementary code point before it is split into two surrogates. */
  108   private static final int HALF_BASE = 0x0010000;
        /** Number of payload bits carried by each surrogate of a pair. */
  109   private static final long HALF_SHIFT = 10;
        /** Mask selecting the low 10 payload bits (the low surrogate's share). */
  110   private static final long HALF_MASK = 0x3FFL;
  111
        /**
         * Constant folded into surrogate-pair decoding so that
         * {@code (high << 10) + low + SURROGATE_OFFSET} yields the code point:
         * it cancels the 0xD800 and 0xDC00 biases and adds the 0x10000 base.
         */
  112   private static final int SURROGATE_OFFSET =
  113     Character.MIN_SUPPLEMENTARY_CODE_POINT -
  114     (UNI_SUR_HIGH_START << HALF_SHIFT) - UNI_SUR_LOW_START;
 115
 116   /**
 117    * @lucene.internal
 118    */
 119   public static final class UTF8Result {
 120     public byte[] result = new byte[10];
 121     public int length;
 122
 123     public void setLength(int newLength) {
 124       if (result.length < newLength) {
 125         result = ArrayUtil.grow(result, newLength);
 126       }
 127       length = newLength;
 128     }
 129   }
 130
 131   /**
 132    * @lucene.internal
 133    */
 134   public static final class UTF16Result {
 135     public char[] result = new char[10];
 136     public int[] offsets = new int[10];
 137     public int length;
 138
 139     public void setLength(int newLength) {
 140       if (result.length < newLength) {
 141         result = ArrayUtil.grow(result, newLength);
 142       }
 143       length = newLength;
 144     }
 145
 146     public void copyText(UTF16Result other) {
 147       setLength(other.length);
 148       System.arraycopy(other.result, 0, result, 0, length);
 149     }
 150   }
 151
 152   /** Encode characters from a char[] source, starting at
 153    *  offset for length chars.  Returns a hash of the resulting bytes.  After encoding, result.offset will always be 0. */
 154   public static int UTF16toUTF8WithHash(final char[] source, final int offset, final int length, BytesRef result) {
 155     int hash = 0;
 156     int upto = 0;
 157     int i = offset;
 158     final int end = offset + length;
 159     byte[] out = result.bytes;
 160     // Pre-allocate for worst case 4-for-1
 161     final int maxLen = length * 4;
 162     if (out.length < maxLen)
 163       out = result.bytes = new byte[ArrayUtil.oversize(maxLen, 1)];
 164     result.offset = 0;
 165
 166     while(i < end) {
 167
 168       final int code = (int) source[i++];
 169
 170       if (code < 0x80) {
 171         hash = 31*hash + (out[upto++] = (byte) code);
 172       } else if (code < 0x800) {
 173         hash = 31*hash + (out[upto++] = (byte) (0xC0 | (code >> 6)));
 174         hash = 31*hash + (out[upto++] = (byte)(0x80 | (code & 0x3F)));
 175       } else if (code < 0xD800 || code > 0xDFFF) {
 176         hash = 31*hash + (out[upto++] = (byte)(0xE0 | (code >> 12)));
 177         hash = 31*hash + (out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F)));
 178         hash = 31*hash + (out[upto++] = (byte)(0x80 | (code & 0x3F)));
 179       } else {
 180         // surrogate pair
 181         // confirm valid high surrogate
 182         if (code < 0xDC00 && i < end) {
 183           int utf32 = (int) source[i];
 184           // confirm valid low surrogate and write pair
 185           if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
 186             utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
 187             i++;
 188             hash = 31*hash + (out[upto++] = (byte)(0xF0 | (utf32 >> 18)));
 189             hash = 31*hash + (out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F)));
 190             hash = 31*hash + (out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F)));
 191             hash = 31*hash + (out[upto++] = (byte)(0x80 | (utf32 & 0x3F)));
 192             continue;
 193           }
 194         }
 195         // replace unpaired surrogate or out-of-order low surrogate
 196         // with substitution character
 197         hash = 31*hash + (out[upto++] = (byte) 0xEF);
 198         hash = 31*hash + (out[upto++] = (byte) 0xBF);
 199         hash = 31*hash + (out[upto++] = (byte) 0xBD);
 200       }
 201     }
 202     //assert matches(source, offset, length, out, upto);
 203     result.length = upto;
 204     return hash;
 205   }
 206
 207   /** Encode characters from a char[] source, starting at
 208    *  offset and stopping when the character 0xffff is seen.
 209    *  Returns the number of bytes written to bytesOut. */
 210   public static void UTF16toUTF8(final char[] source, final int offset, UTF8Result result) {
 211
 212     int upto = 0;
 213     int i = offset;
 214     byte[] out = result.result;
 215
 216     while(true) {
 217
 218       final int code = (int) source[i++];
 219
 220       if (upto+4 > out.length) {
 221         out = result.result = ArrayUtil.grow(out, upto+4);
 222       }
 223       if (code < 0x80)
 224         out[upto++] = (byte) code;
 225       else if (code < 0x800) {
 226         out[upto++] = (byte) (0xC0 | (code >> 6));
 227         out[upto++] = (byte)(0x80 | (code & 0x3F));
 228       } else if (code < 0xD800 || code > 0xDFFF) {
 229         if (code == 0xffff)
 230           // END
 231           break;
 232         out[upto++] = (byte)(0xE0 | (code >> 12));
 233         out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
 234         out[upto++] = (byte)(0x80 | (code & 0x3F));
 235       } else {
 236         // surrogate pair
 237         // confirm valid high surrogate
 238         if (code < 0xDC00 && source[i] != 0xffff) {
 239           int utf32 = (int) source[i];
 240           // confirm valid low surrogate and write pair
 241           if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
 242             utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
 243             i++;
 244             out[upto++] = (byte)(0xF0 | (utf32 >> 18));
 245             out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
 246             out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
 247             out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
 248             continue;
 249           }
 250         }
 251         // replace unpaired surrogate or out-of-order low surrogate
 252         // with substitution character
 253         out[upto++] = (byte) 0xEF;
 254         out[upto++] = (byte) 0xBF;
 255         out[upto++] = (byte) 0xBD;
 256       }
 257     }
 258     //assert matches(source, offset, i-offset-1, out, upto);
 259     result.length = upto;
 260   }
 261
 262   /** Encode characters from a char[] source, starting at
 263    *  offset for length chars.  Returns the number of bytes
 264    *  written to bytesOut. */
 265   public static void UTF16toUTF8(final char[] source, final int offset, final int length, UTF8Result result) {
 266
 267     int upto = 0;
 268     int i = offset;
 269     final int end = offset + length;
 270     byte[] out = result.result;
 271
 272     while(i < end) {
 273
 274       final int code = (int) source[i++];
 275
 276       if (upto+4 > out.length) {
 277         out = result.result = ArrayUtil.grow(out, upto+4);
 278       }
 279       if (code < 0x80)
 280         out[upto++] = (byte) code;
 281       else if (code < 0x800) {
 282         out[upto++] = (byte) (0xC0 | (code >> 6));
 283         out[upto++] = (byte)(0x80 | (code & 0x3F));
 284       } else if (code < 0xD800 || code > 0xDFFF) {
 285         out[upto++] = (byte)(0xE0 | (code >> 12));
 286         out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
 287         out[upto++] = (byte)(0x80 | (code & 0x3F));
 288       } else {
 289         // surrogate pair
 290         // confirm valid high surrogate
 291         if (code < 0xDC00 && i < end && source[i] != 0xffff) {
 292           int utf32 = (int) source[i];
 293           // confirm valid low surrogate and write pair
 294           if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
 295             utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
 296             i++;
 297             out[upto++] = (byte)(0xF0 | (utf32 >> 18));
 298             out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
 299             out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
 300             out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
 301             continue;
 302           }
 303         }
 304         // replace unpaired surrogate or out-of-order low surrogate
 305         // with substitution character
 306         out[upto++] = (byte) 0xEF;
 307         out[upto++] = (byte) 0xBF;
 308         out[upto++] = (byte) 0xBD;
 309       }
 310     }
 311     //assert matches(source, offset, length, out, upto);
 312     result.length = upto;
 313   }
 314
 315   /** Encode characters from this String, starting at offset
 316    *  for length characters.  Returns the number of bytes
 317    *  written to bytesOut. */
 318   public static void UTF16toUTF8(final String s, final int offset, final int length, UTF8Result result) {
 319     final int end = offset + length;
 320
 321     byte[] out = result.result;
 322
 323     int upto = 0;
 324     for(int i=offset;i<end;i++) {
 325       final int code = (int) s.charAt(i);
 326
 327       if (upto+4 > out.length) {
 328         out = result.result = ArrayUtil.grow(out, upto+4);
 329       }
 330       if (code < 0x80)
 331         out[upto++] = (byte) code;
 332       else if (code < 0x800) {
 333         out[upto++] = (byte) (0xC0 | (code >> 6));
 334         out[upto++] = (byte)(0x80 | (code & 0x3F));
 335       } else if (code < 0xD800 || code > 0xDFFF) {
 336         out[upto++] = (byte)(0xE0 | (code >> 12));
 337         out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
 338         out[upto++] = (byte)(0x80 | (code & 0x3F));
 339       } else {
 340         // surrogate pair
 341         // confirm valid high surrogate
 342         if (code < 0xDC00 && (i < end-1)) {
 343           int utf32 = (int) s.charAt(i+1);
 344           // confirm valid low surrogate and write pair
 345           if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
 346             utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
 347             i++;
 348             out[upto++] = (byte)(0xF0 | (utf32 >> 18));
 349             out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
 350             out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
 351             out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
 352             continue;
 353           }
 354         }
 355         // replace unpaired surrogate or out-of-order low surrogate
 356         // with substitution character
 357         out[upto++] = (byte) 0xEF;
 358         out[upto++] = (byte) 0xBF;
 359         out[upto++] = (byte) 0xBD;
 360       }
 361     }
 362     //assert matches(s, offset, length, out, upto);
 363     result.length = upto;
 364   }
 365
 366   /** Encode characters from this String, starting at offset
 367    *  for length characters. After encoding, result.offset will always be 0.
 368    */
 369   public static void UTF16toUTF8(final CharSequence s, final int offset, final int length, BytesRef result) {
 370     final int end = offset + length;
 371
 372     byte[] out = result.bytes;
 373     result.offset = 0;
 374     // Pre-allocate for worst case 4-for-1
 375     final int maxLen = length * 4;
 376     if (out.length < maxLen)
 377       out = result.bytes = new byte[maxLen];
 378
 379     int upto = 0;
 380     for(int i=offset;i<end;i++) {
 381       final int code = (int) s.charAt(i);
 382
 383       if (code < 0x80)
 384         out[upto++] = (byte) code;
 385       else if (code < 0x800) {
 386         out[upto++] = (byte) (0xC0 | (code >> 6));
 387         out[upto++] = (byte)(0x80 | (code & 0x3F));
 388       } else if (code < 0xD800 || code > 0xDFFF) {
 389         out[upto++] = (byte)(0xE0 | (code >> 12));
 390         out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
 391         out[upto++] = (byte)(0x80 | (code & 0x3F));
 392       } else {
 393         // surrogate pair
 394         // confirm valid high surrogate
 395         if (code < 0xDC00 && (i < end-1)) {
 396           int utf32 = (int) s.charAt(i+1);
 397           // confirm valid low surrogate and write pair
 398           if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
 399             utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
 400             i++;
 401             out[upto++] = (byte)(0xF0 | (utf32 >> 18));
 402             out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
 403             out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
 404             out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
 405             continue;
 406           }
 407         }
 408         // replace unpaired surrogate or out-of-order low surrogate
 409         // with substitution character
 410         out[upto++] = (byte) 0xEF;
 411         out[upto++] = (byte) 0xBF;
 412         out[upto++] = (byte) 0xBD;
 413       }
 414     }
 415     //assert matches(s, offset, length, out, upto);
 416     result.length = upto;
 417   }
 418
 419   /** Encode characters from a char[] source, starting at
 420    *  offset for length chars. After encoding, result.offset will always be 0.
 421    */
 422   public static void UTF16toUTF8(final char[] source, final int offset, final int length, BytesRef result) {
 423
 424     int upto = 0;
 425     int i = offset;
 426     final int end = offset + length;
 427     byte[] out = result.bytes;
 428     // Pre-allocate for worst case 4-for-1
 429     final int maxLen = length * 4;
 430     if (out.length < maxLen)
 431       out = result.bytes = new byte[maxLen];
 432     result.offset = 0;
 433
 434     while(i < end) {
 435
 436       final int code = (int) source[i++];
 437
 438       if (code < 0x80)
 439         out[upto++] = (byte) code;
 440       else if (code < 0x800) {
 441         out[upto++] = (byte) (0xC0 | (code >> 6));
 442         out[upto++] = (byte)(0x80 | (code & 0x3F));
 443       } else if (code < 0xD800 || code > 0xDFFF) {
 444         out[upto++] = (byte)(0xE0 | (code >> 12));
 445         out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
 446         out[upto++] = (byte)(0x80 | (code & 0x3F));
 447       } else {
 448         // surrogate pair
 449         // confirm valid high surrogate
 450         if (code < 0xDC00 && i < end) {
 451           int utf32 = (int) source[i];
 452           // confirm valid low surrogate and write pair
 453           if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
 454             utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
 455             i++;
 456             out[upto++] = (byte)(0xF0 | (utf32 >> 18));
 457             out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
 458             out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
 459             out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
 460             continue;
 461           }
 462         }
 463         // replace unpaired surrogate or out-of-order low surrogate
 464         // with substitution character
 465         out[upto++] = (byte) 0xEF;
 466         out[upto++] = (byte) 0xBF;
 467         out[upto++] = (byte) 0xBD;
 468       }
 469     }
 470     //assert matches(source, offset, length, out, upto);
 471     result.length = upto;
 472   }
 473
  474   /** Convert UTF8 bytes into UTF16 characters.  If offset
  475    *  is non-zero, conversion starts at that starting point
  476    *  in utf8, re-using the results from the previous call
  477    *  up until offset.
         *
         *  <p>On return result.result holds the decoded chars,
         *  result.length is their count, and for every byte
         *  position i that was decoded result.offsets[i] is the
         *  index in result.result of the char produced by the
         *  sequence starting at i, or -1 when byte i continues a
         *  multi-byte sequence.  result.offsets[offset+length]
         *  receives the final char count when decoding stops
         *  exactly there.
         *
         *  <p>The input is not validated.  Continuation bytes of
         *  the last sequence are read even when they lie beyond
         *  offset+length, so truncated input can read past the
         *  requested range (or past the array).
         *
         *  @param utf8 buffer holding the UTF-8 bytes
         *  @param offset byte index at which decoding (re)starts
         *  @param length number of bytes to decode from offset
         *  @param result reusable output that also carries the
         *         offsets table between incremental calls */
  478   public static void UTF8toUTF16(final byte[] utf8, final int offset, final int length, final UTF16Result result) {
  479
  480     final int end = offset + length;
  481     char[] out = result.result;
          // The offsets table needs one slot per byte plus one for the end position.
  482     if (result.offsets.length <= end) {
  483       result.offsets = ArrayUtil.grow(result.offsets, end+1);
  484     }
  485     final int[] offsets = result.offsets;
  486
  487     // If incremental decoding fell in the middle of a
  488     // single unicode character, rollback to its start:
  489     int upto = offset;
  490     while(offsets[upto] == -1)
  491       upto--;
  492
          // Resume writing chars where the (possibly rolled-back) byte position maps to.
  493     int outUpto = offsets[upto];
  494
  495     // Pre-allocate for worst case 1-for-1
  496     if (outUpto+length >= out.length) {
  497       out = result.result = ArrayUtil.grow(out, outUpto+length+1);
  498     }
  499
  500     while (upto < end) {
  501
  502       final int b = utf8[upto]&0xff;
  503       final int ch;
  504
            // Lead byte: remember which output char this byte position starts.
  505       offsets[upto++] = outUpto;
  506
            // The lead byte's high bits select the sequence length (1 to 4 bytes);
            // each continuation byte contributes its low 6 bits and is marked -1.
  507       if (b < 0xc0) {
  508         assert b < 0x80;
  509         ch = b;
  510       } else if (b < 0xe0) {
  511         ch = ((b&0x1f)<<6) + (utf8[upto]&0x3f);
  512         offsets[upto++] = -1;
  513       } else if (b < 0xf0) {
  514         ch = ((b&0xf)<<12) + ((utf8[upto]&0x3f)<<6) + (utf8[upto+1]&0x3f);
  515         offsets[upto++] = -1;
  516         offsets[upto++] = -1;
  517       } else {
  518         assert b < 0xf8;
  519         ch = ((b&0x7)<<18) + ((utf8[upto]&0x3f)<<12) + ((utf8[upto+1]&0x3f)<<6) + (utf8[upto+2]&0x3f);
  520         offsets[upto++] = -1;
  521         offsets[upto++] = -1;
  522         offsets[upto++] = -1;
  523       }
  524
  525       if (ch <= UNI_MAX_BMP) {
  526         // target is a character <= 0xFFFF
  527         out[outUpto++] = (char) ch;
  528       } else {
  529         // target is a character in range 0xFFFF - 0x10FFFF
              // so split it into a high/low surrogate pair of 10 payload bits each
  530         final int chHalf = ch - HALF_BASE;
  531         out[outUpto++] = (char) ((chHalf >> HALF_SHIFT) + UNI_SUR_HIGH_START);
  532         out[outUpto++] = (char) ((chHalf & HALF_MASK) + UNI_SUR_LOW_START);
  533       }
  534     }
  535
          // Record the char count at the stop position so a later incremental call can resume here.
  536     offsets[upto] = outUpto;
  537     result.length = outUpto;
  538   }
 539
 540   // Only called from assert
 541   /*
 542   private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) {
 543     try {
 544       String s1 = new String(source, offset, length);
 545       String s2 = new String(result, 0, upto, "UTF-8");
 546       if (!s1.equals(s2)) {
 547         //System.out.println("DIFF: s1 len=" + s1.length());
 548         //for(int i=0;i<s1.length();i++)
 549         //  System.out.println("    " + i + ": " + (int) s1.charAt(i));
 550         //System.out.println("s2 len=" + s2.length());
 551         //for(int i=0;i<s2.length();i++)
 552         //  System.out.println("    " + i + ": " + (int) s2.charAt(i));
 553
 554         // If the input string was invalid, then the
 555         // difference is OK
 556         if (!validUTF16String(s1))
 557           return true;
 558
 559         return false;
 560       }
 561       return s1.equals(s2);
 562     } catch (UnsupportedEncodingException uee) {
 563       return false;
 564     }
 565   }
 566
 567   // Only called from assert
 568   private static boolean matches(String source, int offset, int length, byte[] result, int upto) {
 569     try {
 570       String s1 = source.substring(offset, offset+length);
 571       String s2 = new String(result, 0, upto, "UTF-8");
 572       if (!s1.equals(s2)) {
 573         // Allow a difference if s1 is not valid UTF-16
 574
 575         //System.out.println("DIFF: s1 len=" + s1.length());
 576         //for(int i=0;i<s1.length();i++)
 577         //  System.out.println("    " + i + ": " + (int) s1.charAt(i));
 578         //System.out.println("  s2 len=" + s2.length());
 579         //for(int i=0;i<s2.length();i++)
 580         //  System.out.println("    " + i + ": " + (int) s2.charAt(i));
 581
 582         // If the input string was invalid, then the
 583         // difference is OK
 584         if (!validUTF16String(s1))
 585           return true;
 586
 587         return false;
 588       }
 589       return s1.equals(s2);
 590     } catch (UnsupportedEncodingException uee) {
 591       return false;
 592     }
 593   }
 594
 595   public static final boolean validUTF16String(String s) {
 596     final int size = s.length();
 597     for(int i=0;i<size;i++) {
 598       char ch = s.charAt(i);
 599       if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
 600         if (i < size-1) {
 601           i++;
 602           char nextCH = s.charAt(i);
 603           if (nextCH >= UNI_SUR_LOW_START && nextCH <= UNI_SUR_LOW_END) {
 604             // Valid surrogate pair
 605           } else
 606             // Unmatched high surrogate
 607             return false;
 608         } else
 609           // Unmatched high surrogate
 610           return false;
 611       } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
 612         // Unmatched low surrogate
 613         return false;
 614     }
 615
 616     return true;
 617   }
 618
 619   public static final boolean validUTF16String(char[] s, int size) {
 620     for(int i=0;i<size;i++) {
 621       char ch = s[i];
 622       if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
 623         if (i < size-1) {
 624           i++;
 625           char nextCH = s[i];
 626           if (nextCH >= UNI_SUR_LOW_START && nextCH <= UNI_SUR_LOW_END) {
 627             // Valid surrogate pair
 628           } else
 629             return false;
 630         } else
 631           return false;
 632       } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
 633         // Unmatched low surrogate
 634         return false;
 635     }
 636
 637     return true;
 638   }
 639   */
 640
  641   /** Number of bits a code point is shifted right by to obtain the lead-surrogate part. */
  642   private static final int LEAD_SURROGATE_SHIFT_ = 10;
  643   /** Mask selecting the low 10 bits of a code point, i.e. the payload of the trail surrogate. */
  644   private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
  645   /** Smallest trail (low) surrogate value. */
  646   private static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
  647   /** Smallest lead (high) surrogate value. */
  648   private static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
  649   /** Smallest supplementary code point (the first one beyond U+FFFF). */
  650   private static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
  651   /** Bias added to (codePoint >> 10) to get the lead surrogate: 0xD800 - (0x10000 >> 10) = 0xD7C0. */
  652   private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
  653           - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
 654
 655   /**
 656    * Cover JDK 1.5 API. Create a String from an array of codePoints.
 657    *
 658    * @param codePoints The code array
 659    * @param offset The start of the text in the code point array
 660    * @param count The number of code points
 661    * @return a String representing the code points between offset and count
 662    * @throws IllegalArgumentException If an invalid code point is encountered
 663    * @throws IndexOutOfBoundsException If the offset or count are out of bounds.
 664    */
 665   public static String newString(int[] codePoints, int offset, int count) {
 666       if (count < 0) {
 667           throw new IllegalArgumentException();
 668       }
 669       char[] chars = new char[count];
 670       int w = 0;
 671       for (int r = offset, e = offset + count; r < e; ++r) {
 672           int cp = codePoints[r];
 673           if (cp < 0 || cp > 0x10ffff) {
 674               throw new IllegalArgumentException();
 675           }
 676           while (true) {
 677               try {
 678                   if (cp < 0x010000) {
 679                       chars[w] = (char) cp;
 680                       w++;
 681                   } else {
 682                       chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
 683                       chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
 684                       w += 2;
 685                   }
 686                   break;
 687               } catch (IndexOutOfBoundsException ex) {
 688                   int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2)
 689                           / (r - offset + 1)));
 690                   char[] temp = new char[newlen];
 691                   System.arraycopy(chars, 0, temp, 0, w);
 692                   chars = temp;
 693               }
 694           }
 695       }
 696       return new String(chars, 0, w);
 697   }
 698
 699   /**
 700    * Interprets the given byte array as UTF-8 and converts to UTF-16. The {@link CharsRef} will be extended if
 701    * it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
 702    * <p>
 703    * NOTE: Full characters are read, even if this reads past the length passed (and
 704    * can result in an ArrayOutOfBoundsException if invalid UTF-8 is passed).
 705    * Explicit checks for valid UTF-8 are not performed.
 706    */
 707   public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars) {
 708     int out_offset = chars.offset = 0;
 709     final char[] out = chars.chars =  ArrayUtil.grow(chars.chars, length);
 710     final int limit = offset + length;
 711     while (offset < limit) {
 712       int b = utf8[offset++]&0xff;
 713       if (b < 0xc0) {
 714         assert b < 0x80;
 715         out[out_offset++] = (char)b;
 716       } else if (b < 0xe0) {
 717         out[out_offset++] = (char)(((b&0x1f)<<6) + (utf8[offset++]&0x3f));
 718       } else if (b < 0xf0) {
 719         out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f));
 720         offset += 2;
 721       } else {
 722         assert b < 0xf8;
 723         int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f);
 724         offset += 3;
 725         if (ch < UNI_MAX_BMP) {
 726           out[out_offset++] = (char)ch;
 727         } else {
 728           int chHalf = ch - 0x0010000;
 729           out[out_offset++] = (char) ((chHalf >> 10) + 0xD800);
 730           out[out_offset++] = (char) ((chHalf & HALF_MASK) + 0xDC00);
 731         }
 732       }
 733     }
 734     chars.length = out_offset - chars.offset;
 735   }
 736
 737   /**
 738    * Utility method for {@link #UTF8toUTF16(byte[], int, int, CharsRef)}
 739    * @see #UTF8toUTF16(byte[], int, int, CharsRef)
 740    */
 741   public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars) {
 742     UTF8toUTF16(bytesRef.bytes, bytesRef.offset, bytesRef.length, chars);
 743   }
 744 }