1 package org.apache.lucene.util;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
22 * Some of this code came from the excellent Unicode
23 * conversion examples from:
25 * http://www.unicode.org/Public/PROGRAMS/CVTUTF
27 * Full Copyright for that code follows:
31 * Copyright 2001-2004 Unicode, Inc.
35 * This source code is provided as is by Unicode, Inc. No claims are
36 * made as to fitness for any particular purpose. No warranties of any
37 * kind are expressed or implied. The recipient agrees to determine
38 * applicability of information provided. If this file has been
39 * purchased on magnetic or optical media from Unicode, Inc., the
40 * sole remedy for any claim will be exchange of defective media
41 * within 90 days of receipt.
43 * Limitations on Rights to Redistribute This Code
45 * Unicode, Inc. hereby grants the right to freely use the information
46 * supplied in this file in the creation of products supporting the
47 * Unicode Standard, and to make copies of this file in any form
48 * for internal or external distribution as long as this notice
53 * Additional code came from the IBM ICU library.
55 * http://www.icu-project.org
57 * Full Copyright for that code follows.
61 * Copyright (C) 1999-2010, International Business Machines
62 * Corporation and others. All Rights Reserved.
64 * Permission is hereby granted, free of charge, to any person obtaining a copy
65 * of this software and associated documentation files (the "Software"), to deal
66 * in the Software without restriction, including without limitation the rights
67 * to use, copy, modify, merge, publish, distribute, and/or sell copies of the
68 * Software, and to permit persons to whom the Software is furnished to do so,
69 * provided that the above copyright notice(s) and this permission notice appear
70 * in all copies of the Software and that both the above copyright notice(s) and
71 * this permission notice appear in supporting documentation.
73 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
74 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
75 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
76 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
77 * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
78 * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
79 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
80 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
82 * Except as contained in this notice, the name of a copyright holder shall not
83 * be used in advertising or otherwise to promote the sale, use or other
84 * dealings in this Software without prior written authorization of the
89 * Class to encode java's UTF16 char[] into UTF8 byte[]
90 * without always allocating a new byte[] as
91 * String.getBytes("UTF-8") does.
96 public final class UnicodeUtil {
// Static utility holder: the private constructor prevents instantiation.
private UnicodeUtil() {} // no instance

// UTF-16 surrogate code-unit ranges: high (lead) surrogates are
// 0xD800-0xDBFF, low (trail) surrogates are 0xDC00-0xDFFF.
public static final int UNI_SUR_HIGH_START = 0xD800;
public static final int UNI_SUR_HIGH_END = 0xDBFF;
public static final int UNI_SUR_LOW_START = 0xDC00;
public static final int UNI_SUR_LOW_END = 0xDFFF;
// U+FFFD; its UTF-8 form (EF BF BD) is what the encoders in this class
// emit in place of an unpaired surrogate.
public static final int UNI_REPLACEMENT_CHAR = 0xFFFD;

// Largest code point that fits in a single UTF-16 code unit.
private static final long UNI_MAX_BMP = 0x0000FFFF;

// Pieces used when splitting a supplementary code point into a surrogate
// pair: subtract HALF_BASE, then the upper bits (>> HALF_SHIFT) select the
// high surrogate and the lower bits (& HALF_MASK) select the low surrogate.
private static final int HALF_BASE = 0x0010000;
private static final long HALF_SHIFT = 10;
private static final long HALF_MASK = 0x3FFL;

// Precomputed correction so that (high << 10) + low + SURROGATE_OFFSET
// yields the code point encoded by the surrogate pair (high, low).
private static final int SURROGATE_OFFSET =
  Character.MIN_SUPPLEMENTARY_CODE_POINT -
  (UNI_SUR_HIGH_START << HALF_SHIFT) - UNI_SUR_LOW_START;
119 public static final class UTF8Result {
120 public byte[] result = new byte[10];
123 public void setLength(int newLength) {
124 if (result.length < newLength) {
125 result = ArrayUtil.grow(result, newLength);
134 public static final class UTF16Result {
135 public char[] result = new char[10];
136 public int[] offsets = new int[10];
139 public void setLength(int newLength) {
140 if (result.length < newLength) {
141 result = ArrayUtil.grow(result, newLength);
146 public void copyText(UTF16Result other) {
147 setLength(other.length);
148 System.arraycopy(other.result, 0, result, 0, length);
152 /** Encode characters from a char[] source, starting at
153 * offset for length chars. Returns a hash of the resulting bytes. After encoding, result.offset will always be 0. */
154 public static int UTF16toUTF8WithHash(final char[] source, final int offset, final int length, BytesRef result) {
158 final int end = offset + length;
159 byte[] out = result.bytes;
160 // Pre-allocate for worst case 4-for-1
161 final int maxLen = length * 4;
162 if (out.length < maxLen)
163 out = result.bytes = new byte[ArrayUtil.oversize(maxLen, 1)];
168 final int code = (int) source[i++];
171 hash = 31*hash + (out[upto++] = (byte) code);
172 } else if (code < 0x800) {
173 hash = 31*hash + (out[upto++] = (byte) (0xC0 | (code >> 6)));
174 hash = 31*hash + (out[upto++] = (byte)(0x80 | (code & 0x3F)));
175 } else if (code < 0xD800 || code > 0xDFFF) {
176 hash = 31*hash + (out[upto++] = (byte)(0xE0 | (code >> 12)));
177 hash = 31*hash + (out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F)));
178 hash = 31*hash + (out[upto++] = (byte)(0x80 | (code & 0x3F)));
181 // confirm valid high surrogate
182 if (code < 0xDC00 && i < end) {
183 int utf32 = (int) source[i];
184 // confirm valid low surrogate and write pair
185 if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
186 utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
188 hash = 31*hash + (out[upto++] = (byte)(0xF0 | (utf32 >> 18)));
189 hash = 31*hash + (out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F)));
190 hash = 31*hash + (out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F)));
191 hash = 31*hash + (out[upto++] = (byte)(0x80 | (utf32 & 0x3F)));
195 // replace unpaired surrogate or out-of-order low surrogate
196 // with substitution character
197 hash = 31*hash + (out[upto++] = (byte) 0xEF);
198 hash = 31*hash + (out[upto++] = (byte) 0xBF);
199 hash = 31*hash + (out[upto++] = (byte) 0xBD);
202 //assert matches(source, offset, length, out, upto);
203 result.length = upto;
207 /** Encode characters from a char[] source, starting at
208 * offset and stopping when the character 0xffff is seen.
209 * Returns the number of bytes written to bytesOut. */
210 public static void UTF16toUTF8(final char[] source, final int offset, UTF8Result result) {
214 byte[] out = result.result;
218 final int code = (int) source[i++];
220 if (upto+4 > out.length) {
221 out = result.result = ArrayUtil.grow(out, upto+4);
224 out[upto++] = (byte) code;
225 else if (code < 0x800) {
226 out[upto++] = (byte) (0xC0 | (code >> 6));
227 out[upto++] = (byte)(0x80 | (code & 0x3F));
228 } else if (code < 0xD800 || code > 0xDFFF) {
232 out[upto++] = (byte)(0xE0 | (code >> 12));
233 out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
234 out[upto++] = (byte)(0x80 | (code & 0x3F));
237 // confirm valid high surrogate
238 if (code < 0xDC00 && source[i] != 0xffff) {
239 int utf32 = (int) source[i];
240 // confirm valid low surrogate and write pair
241 if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
242 utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
244 out[upto++] = (byte)(0xF0 | (utf32 >> 18));
245 out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
246 out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
247 out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
251 // replace unpaired surrogate or out-of-order low surrogate
252 // with substitution character
253 out[upto++] = (byte) 0xEF;
254 out[upto++] = (byte) 0xBF;
255 out[upto++] = (byte) 0xBD;
258 //assert matches(source, offset, i-offset-1, out, upto);
259 result.length = upto;
262 /** Encode characters from a char[] source, starting at
263 * offset for length chars. Returns the number of bytes
264 * written to bytesOut. */
265 public static void UTF16toUTF8(final char[] source, final int offset, final int length, UTF8Result result) {
269 final int end = offset + length;
270 byte[] out = result.result;
274 final int code = (int) source[i++];
276 if (upto+4 > out.length) {
277 out = result.result = ArrayUtil.grow(out, upto+4);
280 out[upto++] = (byte) code;
281 else if (code < 0x800) {
282 out[upto++] = (byte) (0xC0 | (code >> 6));
283 out[upto++] = (byte)(0x80 | (code & 0x3F));
284 } else if (code < 0xD800 || code > 0xDFFF) {
285 out[upto++] = (byte)(0xE0 | (code >> 12));
286 out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
287 out[upto++] = (byte)(0x80 | (code & 0x3F));
290 // confirm valid high surrogate
291 if (code < 0xDC00 && i < end && source[i] != 0xffff) {
292 int utf32 = (int) source[i];
293 // confirm valid low surrogate and write pair
294 if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
295 utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
297 out[upto++] = (byte)(0xF0 | (utf32 >> 18));
298 out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
299 out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
300 out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
304 // replace unpaired surrogate or out-of-order low surrogate
305 // with substitution character
306 out[upto++] = (byte) 0xEF;
307 out[upto++] = (byte) 0xBF;
308 out[upto++] = (byte) 0xBD;
311 //assert matches(source, offset, length, out, upto);
312 result.length = upto;
315 /** Encode characters from this String, starting at offset
316 * for length characters. Returns the number of bytes
317 * written to bytesOut. */
318 public static void UTF16toUTF8(final String s, final int offset, final int length, UTF8Result result) {
319 final int end = offset + length;
321 byte[] out = result.result;
324 for(int i=offset;i<end;i++) {
325 final int code = (int) s.charAt(i);
327 if (upto+4 > out.length) {
328 out = result.result = ArrayUtil.grow(out, upto+4);
331 out[upto++] = (byte) code;
332 else if (code < 0x800) {
333 out[upto++] = (byte) (0xC0 | (code >> 6));
334 out[upto++] = (byte)(0x80 | (code & 0x3F));
335 } else if (code < 0xD800 || code > 0xDFFF) {
336 out[upto++] = (byte)(0xE0 | (code >> 12));
337 out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
338 out[upto++] = (byte)(0x80 | (code & 0x3F));
341 // confirm valid high surrogate
342 if (code < 0xDC00 && (i < end-1)) {
343 int utf32 = (int) s.charAt(i+1);
344 // confirm valid low surrogate and write pair
345 if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
346 utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
348 out[upto++] = (byte)(0xF0 | (utf32 >> 18));
349 out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
350 out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
351 out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
355 // replace unpaired surrogate or out-of-order low surrogate
356 // with substitution character
357 out[upto++] = (byte) 0xEF;
358 out[upto++] = (byte) 0xBF;
359 out[upto++] = (byte) 0xBD;
362 //assert matches(s, offset, length, out, upto);
363 result.length = upto;
366 /** Encode characters from this String, starting at offset
367 * for length characters. After encoding, result.offset will always be 0.
369 public static void UTF16toUTF8(final CharSequence s, final int offset, final int length, BytesRef result) {
370 final int end = offset + length;
372 byte[] out = result.bytes;
374 // Pre-allocate for worst case 4-for-1
375 final int maxLen = length * 4;
376 if (out.length < maxLen)
377 out = result.bytes = new byte[maxLen];
380 for(int i=offset;i<end;i++) {
381 final int code = (int) s.charAt(i);
384 out[upto++] = (byte) code;
385 else if (code < 0x800) {
386 out[upto++] = (byte) (0xC0 | (code >> 6));
387 out[upto++] = (byte)(0x80 | (code & 0x3F));
388 } else if (code < 0xD800 || code > 0xDFFF) {
389 out[upto++] = (byte)(0xE0 | (code >> 12));
390 out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
391 out[upto++] = (byte)(0x80 | (code & 0x3F));
394 // confirm valid high surrogate
395 if (code < 0xDC00 && (i < end-1)) {
396 int utf32 = (int) s.charAt(i+1);
397 // confirm valid low surrogate and write pair
398 if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
399 utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
401 out[upto++] = (byte)(0xF0 | (utf32 >> 18));
402 out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
403 out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
404 out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
408 // replace unpaired surrogate or out-of-order low surrogate
409 // with substitution character
410 out[upto++] = (byte) 0xEF;
411 out[upto++] = (byte) 0xBF;
412 out[upto++] = (byte) 0xBD;
415 //assert matches(s, offset, length, out, upto);
416 result.length = upto;
419 /** Encode characters from a char[] source, starting at
420 * offset for length chars. After encoding, result.offset will always be 0.
422 public static void UTF16toUTF8(final char[] source, final int offset, final int length, BytesRef result) {
426 final int end = offset + length;
427 byte[] out = result.bytes;
428 // Pre-allocate for worst case 4-for-1
429 final int maxLen = length * 4;
430 if (out.length < maxLen)
431 out = result.bytes = new byte[maxLen];
436 final int code = (int) source[i++];
439 out[upto++] = (byte) code;
440 else if (code < 0x800) {
441 out[upto++] = (byte) (0xC0 | (code >> 6));
442 out[upto++] = (byte)(0x80 | (code & 0x3F));
443 } else if (code < 0xD800 || code > 0xDFFF) {
444 out[upto++] = (byte)(0xE0 | (code >> 12));
445 out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
446 out[upto++] = (byte)(0x80 | (code & 0x3F));
449 // confirm valid high surrogate
450 if (code < 0xDC00 && i < end) {
451 int utf32 = (int) source[i];
452 // confirm valid low surrogate and write pair
453 if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
454 utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
456 out[upto++] = (byte)(0xF0 | (utf32 >> 18));
457 out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
458 out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
459 out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
463 // replace unpaired surrogate or out-of-order low surrogate
464 // with substitution character
465 out[upto++] = (byte) 0xEF;
466 out[upto++] = (byte) 0xBF;
467 out[upto++] = (byte) 0xBD;
470 //assert matches(source, offset, length, out, upto);
471 result.length = upto;
474 /** Convert UTF8 bytes into UTF16 characters. If offset
475 * is non-zero, conversion starts at that starting point
476 * in utf8, re-using the results from the previous call
477 * up until offset. */
478 public static void UTF8toUTF16(final byte[] utf8, final int offset, final int length, final UTF16Result result) {
480 final int end = offset + length;
481 char[] out = result.result;
482 if (result.offsets.length <= end) {
483 result.offsets = ArrayUtil.grow(result.offsets, end+1);
485 final int[] offsets = result.offsets;
487 // If incremental decoding fell in the middle of a
488 // single unicode character, rollback to its start:
490 while(offsets[upto] == -1)
493 int outUpto = offsets[upto];
495 // Pre-allocate for worst case 1-for-1
496 if (outUpto+length >= out.length) {
497 out = result.result = ArrayUtil.grow(out, outUpto+length+1);
502 final int b = utf8[upto]&0xff;
505 offsets[upto++] = outUpto;
510 } else if (b < 0xe0) {
511 ch = ((b&0x1f)<<6) + (utf8[upto]&0x3f);
512 offsets[upto++] = -1;
513 } else if (b < 0xf0) {
514 ch = ((b&0xf)<<12) + ((utf8[upto]&0x3f)<<6) + (utf8[upto+1]&0x3f);
515 offsets[upto++] = -1;
516 offsets[upto++] = -1;
519 ch = ((b&0x7)<<18) + ((utf8[upto]&0x3f)<<12) + ((utf8[upto+1]&0x3f)<<6) + (utf8[upto+2]&0x3f);
520 offsets[upto++] = -1;
521 offsets[upto++] = -1;
522 offsets[upto++] = -1;
525 if (ch <= UNI_MAX_BMP) {
526 // target is a character <= 0xFFFF
527 out[outUpto++] = (char) ch;
529 // target is a character in range 0xFFFF - 0x10FFFF
530 final int chHalf = ch - HALF_BASE;
531 out[outUpto++] = (char) ((chHalf >> HALF_SHIFT) + UNI_SUR_HIGH_START);
532 out[outUpto++] = (char) ((chHalf & HALF_MASK) + UNI_SUR_LOW_START);
536 offsets[upto] = outUpto;
537 result.length = outUpto;
540 // Only called from assert
542 private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) {
544 String s1 = new String(source, offset, length);
545 String s2 = new String(result, 0, upto, "UTF-8");
546 if (!s1.equals(s2)) {
547 //System.out.println("DIFF: s1 len=" + s1.length());
548 //for(int i=0;i<s1.length();i++)
549 // System.out.println(" " + i + ": " + (int) s1.charAt(i));
550 //System.out.println("s2 len=" + s2.length());
551 //for(int i=0;i<s2.length();i++)
552 // System.out.println(" " + i + ": " + (int) s2.charAt(i));
554 // If the input string was invalid, then the
556 if (!validUTF16String(s1))
561 return s1.equals(s2);
562 } catch (UnsupportedEncodingException uee) {
567 // Only called from assert
568 private static boolean matches(String source, int offset, int length, byte[] result, int upto) {
570 String s1 = source.substring(offset, offset+length);
571 String s2 = new String(result, 0, upto, "UTF-8");
572 if (!s1.equals(s2)) {
573 // Allow a difference if s1 is not valid UTF-16
575 //System.out.println("DIFF: s1 len=" + s1.length());
576 //for(int i=0;i<s1.length();i++)
577 // System.out.println(" " + i + ": " + (int) s1.charAt(i));
578 //System.out.println(" s2 len=" + s2.length());
579 //for(int i=0;i<s2.length();i++)
580 // System.out.println(" " + i + ": " + (int) s2.charAt(i));
582 // If the input string was invalid, then the
584 if (!validUTF16String(s1))
589 return s1.equals(s2);
590 } catch (UnsupportedEncodingException uee) {
595 public static final boolean validUTF16String(String s) {
596 final int size = s.length();
597 for(int i=0;i<size;i++) {
598 char ch = s.charAt(i);
599 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
602 char nextCH = s.charAt(i);
603 if (nextCH >= UNI_SUR_LOW_START && nextCH <= UNI_SUR_LOW_END) {
604 // Valid surrogate pair
606 // Unmatched high surrogate
609 // Unmatched high surrogate
611 } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
612 // Unmatched low surrogate
619 public static final boolean validUTF16String(char[] s, int size) {
620 for(int i=0;i<size;i++) {
622 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
626 if (nextCH >= UNI_SUR_LOW_START && nextCH <= UNI_SUR_LOW_END) {
627 // Valid surrogate pair
632 } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
633 // Unmatched low surrogate
641 /** Shift value for lead surrogate to form a supplementary character. */
642 private static final int LEAD_SURROGATE_SHIFT_ = 10;
643 /** Mask to retrieve the significant value from a trail surrogate.*/
644 private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
645 /** Trail surrogate minimum value */
646 private static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
647 /** Lead surrogate minimum value */
648 private static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
649 /** The minimum value for Supplementary code points */
650 private static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
651 /** Value that all lead surrogate starts with */
652 private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
653 - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
656 * Cover JDK 1.5 API. Create a String from an array of codePoints.
658 * @param codePoints The code array
659 * @param offset The start of the text in the code point array
660 * @param count The number of code points
661 * @return a String representing the code points between offset and count
662 * @throws IllegalArgumentException If an invalid code point is encountered
663 * @throws IndexOutOfBoundsException If the offset or count are out of bounds.
665 public static String newString(int[] codePoints, int offset, int count) {
667 throw new IllegalArgumentException();
669 char[] chars = new char[count];
671 for (int r = offset, e = offset + count; r < e; ++r) {
672 int cp = codePoints[r];
673 if (cp < 0 || cp > 0x10ffff) {
674 throw new IllegalArgumentException();
679 chars[w] = (char) cp;
682 chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
683 chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
687 } catch (IndexOutOfBoundsException ex) {
688 int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2)
689 / (r - offset + 1)));
690 char[] temp = new char[newlen];
691 System.arraycopy(chars, 0, temp, 0, w);
696 return new String(chars, 0, w);
700 * Interprets the given byte array as UTF-8 and converts to UTF-16. The {@link CharsRef} will be extended if
701 * it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
703 * NOTE: Full characters are read, even if this reads past the length passed (and
704 * can result in an ArrayOutOfBoundsException if invalid UTF-8 is passed).
705 * Explicit checks for valid UTF-8 are not performed.
707 public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars) {
708 int out_offset = chars.offset = 0;
709 final char[] out = chars.chars = ArrayUtil.grow(chars.chars, length);
710 final int limit = offset + length;
711 while (offset < limit) {
712 int b = utf8[offset++]&0xff;
715 out[out_offset++] = (char)b;
716 } else if (b < 0xe0) {
717 out[out_offset++] = (char)(((b&0x1f)<<6) + (utf8[offset++]&0x3f));
718 } else if (b < 0xf0) {
719 out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f));
723 int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f);
725 if (ch < UNI_MAX_BMP) {
726 out[out_offset++] = (char)ch;
728 int chHalf = ch - 0x0010000;
729 out[out_offset++] = (char) ((chHalf >> 10) + 0xD800);
730 out[out_offset++] = (char) ((chHalf & HALF_MASK) + 0xDC00);
734 chars.length = out_offset - chars.offset;
738 * Utility method for {@link #UTF8toUTF16(byte[], int, int, CharsRef)}
739 * @see #UTF8toUTF16(byte[], int, int, CharsRef)
741 public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars) {
742 UTF8toUTF16(bytesRef.bytes, bytesRef.offset, bytesRef.length, chars);