1 /*************************************************************************
3 * Copyright 2016 Realm Inc.
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 **************************************************************************/
19 #ifndef REALM_UTIL_UTF8_HPP
20 #define REALM_UTIL_UTF8_HPP
25 #include <realm/util/safe_int_ops.hpp>
26 #include <realm/string_data.hpp>
27 #include <realm/util/features.h>
28 #include <realm/utilities.hpp>
/// Transcode between UTF-8 and UTF-16.
///
/// All members are static; the struct only binds the UTF-16 element type and
/// its character traits.
///
/// \tparam Char16 Must be an integral type with at least 16 bits.
///
/// \tparam Traits16 Must define to_int_type() and to_char_type() for
/// \a Char16.
template <class Char16, class Traits16 = std::char_traits<Char16>>
struct Utf8x16 {
    /// Transcode as much as possible of the specified UTF-8 input, to
    /// UTF-16. Returns true if all input characters were transcoded, or
    /// transcoding stopped because the next character did not fit into the
    /// output buffer. Returns false if transcoding stopped due to invalid
    /// input. It is not specified whether this function returns true or false
    /// if invalid input occurs at the same time as the output buffer runs
    /// full. In any case, upon return, \a in_begin and \a out_begin are
    /// advanced to the position where transcoding stopped.
    ///
    /// Throws only if Traits16::to_char_type() throws.
    static bool to_utf16(const char*& in_begin, const char* in_end, Char16*& out_begin, Char16* out_end);

    /// Same as to_utf16(), but in reverse: transcode as much as possible of
    /// the specified UTF-16 input to UTF-8, with the same return value and
    /// pointer-advancing conventions.
    ///
    /// Throws only if Traits16::to_int_type() throws.
    static bool to_utf8(const Char16*& in_begin, const Char16* in_end, char*& out_begin, char* out_end);

    /// Summarize the number of UTF-16 elements needed to hold the result of
    /// transcoding the specified UTF-8 string. Upon return, if \a in_begin !=
    /// \a in_end, then the summation stopped due to invalid UTF-8 input. The
    /// returned size then reflects the number of UTF-16 elements needed to hold
    /// the result of transcoding the part of the input that was examined. This
    /// function will only detect a few UTF-8 validity issues, and can therefore
    /// not be used for general UTF-8 validation.
    static size_t find_utf16_buf_size(const char*& in_begin, const char* in_end);

    /// Summarize the number of UTF-8 bytes needed to hold the result of
    /// transcoding the specified UTF-16 string. Upon return, if \a in_begin !=
    /// \a in_end, then the summation stopped due to invalid UTF-16 input, or to
    /// prevent the returned \c size_t value from overflowing. The returned size
    /// then reflects the number of UTF-8 bytes needed to hold the result of
    /// transcoding the part of the input that was examined. This function will
    /// only detect a few UTF-16 validity issues, and can therefore not be used
    /// for general UTF-16 validation.
    static size_t find_utf8_buf_size(const Char16*& in_begin, const Char16* in_end);
};
82 // Adapted from reference implementation.
83 // http://www.unicode.org/resources/utf8.html
84 // http://www.bsdua.org/files/unicode.tar.gz
85 template <class Char16, class Traits16>
86 inline bool Utf8x16<Char16, Traits16>::to_utf16(const char*& in_begin, const char* const in_end, Char16*& out_begin,
87 Char16* const out_end)
89 typedef std::char_traits<char> traits8;
91 const char* in = in_begin;
92 Char16* out = out_begin;
93 while (in != in_end) {
94 if (REALM_UNLIKELY(out == out_end)) {
95 break; // Need space in output buffer
97 REALM_ASSERT(&in[0] >= in_begin && &in[0] < in_end);
98 uint_fast16_t v1 = uint_fast16_t(traits8::to_int_type(in[0]));
99 if (REALM_LIKELY(v1 < 0x80)) { // One byte
100 // UTF-8 layout: 0xxxxxxx
101 *out++ = Traits16::to_char_type(v1);
105 if (REALM_UNLIKELY(v1 < 0xC0)) {
107 break; // Invalid first byte of UTF-8 sequence
109 if (REALM_LIKELY(v1 < 0xE0)) { // Two bytes
110 if (REALM_UNLIKELY(in_end - in < 2)) {
112 break; // Incomplete UTF-8 sequence
114 REALM_ASSERT(&in[1] >= in_begin && &in[1] < in_end);
115 uint_fast16_t v2 = uint_fast16_t(traits8::to_int_type(in[1]));
116 // UTF-8 layout: 110xxxxx 10xxxxxx
117 if (REALM_UNLIKELY((v2 & 0xC0) != 0x80)) {
119 break; // Invalid continuation byte
121 uint_fast16_t v = uint_fast16_t(((v1 & 0x1F) << 6) | ((v2 & 0x3F) << 0));
122 if (REALM_UNLIKELY(v < 0x80)) {
124 break; // Overlong encoding is invalid
126 *out++ = Traits16::to_char_type(v);
130 if (REALM_LIKELY(v1 < 0xF0)) { // Three bytes
131 if (REALM_UNLIKELY(in_end - in < 3)) {
133 break; // Incomplete UTF-8 sequence
135 REALM_ASSERT(&in[1] >= in_begin && &in[2] < in_end);
136 uint_fast16_t v2 = uint_fast16_t(traits8::to_int_type(in[1]));
137 uint_fast16_t v3 = uint_fast16_t(traits8::to_int_type(in[2]));
138 // UTF-8 layout: 1110xxxx 10xxxxxx 10xxxxxx
139 if (REALM_UNLIKELY((v2 & 0xC0) != 0x80 || (v3 & 0xC0) != 0x80)) {
141 break; // Invalid continuation byte
143 uint_fast16_t v = uint_fast16_t(((v1 & 0x0F) << 12) | ((v2 & 0x3F) << 6) | ((v3 & 0x3F) << 0));
144 if (REALM_UNLIKELY(v < 0x800)) {
146 break; // Overlong encoding is invalid
148 if (REALM_UNLIKELY(0xD800 <= v && v < 0xE000)) {
150 break; // Illegal code point range (reserved for UTF-16 surrogate pairs)
152 *out++ = Traits16::to_char_type(v);
156 if (REALM_UNLIKELY(out + 1 == out_end)) {
157 break; // Need space in output buffer for surrogate pair
159 if (REALM_LIKELY(v1 < 0xF8)) { // Four bytes
160 if (REALM_UNLIKELY(in_end - in < 4)) {
162 break; // Incomplete UTF-8 sequence
164 uint_fast32_t w1 = uint_fast32_t(v1); // 16 bit -> 32 bit
165 REALM_ASSERT(&in[1] >= in_begin && &in[3] < in_end);
166 uint_fast32_t v2 = uint_fast32_t(traits8::to_int_type(in[1])); // 32 bit intended
167 uint_fast16_t v3 = uint_fast16_t(traits8::to_int_type(in[2])); // 16 bit intended
168 uint_fast16_t v4 = uint_fast16_t(traits8::to_int_type(in[3])); // 16 bit intended
169 // UTF-8 layout: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
170 if (REALM_UNLIKELY((v2 & 0xC0) != 0x80 || (v3 & 0xC0) != 0x80 || (v4 & 0xC0) != 0x80)) {
172 break; // Invalid continuation byte
174 uint_fast32_t v = uint_fast32_t(((w1 & 0x07) << 18) | // Parenthesis is 32 bit partial result
175 ((v2 & 0x3F) << 12) | // Parenthesis is 32 bit partial result
176 ((v3 & 0x3F) << 6) | // Parenthesis is 16 bit partial result
177 ((v4 & 0x3F) << 0)); // Parenthesis is 16 bit partial result
178 if (REALM_UNLIKELY(v < 0x10000)) {
180 break; // Overlong encoding is invalid
182 if (REALM_UNLIKELY(0x110000 <= v)) {
184 break; // Code point too big for UTF-16
187 *out++ = Traits16::to_char_type(0xD800 + (v / 0x400));
188 *out++ = Traits16::to_char_type(0xDC00 + (v % 0x400));
192 // Invalid first byte of UTF-8 sequence, or code point too big for UTF-16
197 REALM_ASSERT(in >= in_begin && in <= in_end);
198 REALM_ASSERT(out >= out_begin && out <= out_end);
205 template <class Char16, class Traits16>
206 inline size_t Utf8x16<Char16, Traits16>::find_utf16_buf_size(const char*& in_begin, const char* const in_end)
208 typedef std::char_traits<char> traits8;
210 const char* in = in_begin;
211 while (in != in_end) {
212 REALM_ASSERT(&in[0] >= in_begin && &in[0] < in_end);
213 uint_fast16_t v1 = uint_fast16_t(traits8::to_int_type(in[0]));
214 if (REALM_LIKELY(v1 < 0x80)) { // One byte
219 if (REALM_UNLIKELY(v1 < 0xC0)) {
220 break; // Invalid first byte of UTF-8 sequence
222 if (REALM_LIKELY(v1 < 0xE0)) { // Two bytes
223 if (REALM_UNLIKELY(in_end - in < 2)) {
224 break; // Incomplete UTF-8 sequence
230 if (REALM_LIKELY(v1 < 0xF0)) { // Three bytes
231 if (REALM_UNLIKELY(in_end - in < 3)) {
232 break; // Incomplete UTF-8 sequence
238 if (REALM_LIKELY(v1 < 0xF8)) { // Four bytes
239 if (REALM_UNLIKELY(in_end - in < 4)) {
240 break; // Incomplete UTF-8 sequence
242 num_out += 2; // Surrogate pair
246 // Invalid first byte of UTF-8 sequence, or code point too big for UTF-16
250 REALM_ASSERT(in >= in_begin && in <= in_end);
256 // Adapted from reference implementation.
257 // http://www.unicode.org/resources/utf8.html
258 // http://www.bsdua.org/files/unicode.tar.gz
259 template <class Char16, class Traits16>
260 inline bool Utf8x16<Char16, Traits16>::to_utf8(const Char16*& in_begin, const Char16* const in_end, char*& out_begin,
263 typedef std::char_traits<char> traits8;
264 typedef typename traits8::int_type traits8_int_type;
265 bool invalid = false;
266 const Char16* in = in_begin;
267 char* out = out_begin;
268 while (in != in_end) {
269 REALM_ASSERT(&in[0] >= in_begin && &in[0] < in_end);
270 uint_fast16_t v1 = uint_fast16_t(Traits16::to_int_type(in[0]));
271 if (REALM_LIKELY(v1 < 0x80)) {
272 if (REALM_UNLIKELY(out == out_end)) {
273 break; // Not enough output buffer space
275 // UTF-8 layout: 0xxxxxxx
276 REALM_ASSERT(out >= out_begin && out < out_end);
277 *out++ = traits8::to_char_type(traits8_int_type(v1));
281 if (REALM_LIKELY(v1 < 0x800)) {
282 if (REALM_UNLIKELY(out_end - out < 2)) {
283 break; // Not enough output buffer space
285 // UTF-8 layout: 110xxxxx 10xxxxxx
286 *out++ = traits8::to_char_type(traits8_int_type(0xC0 + v1 / 0x40));
287 REALM_ASSERT(out >= out_begin && out < out_end);
288 *out++ = traits8::to_char_type(traits8_int_type(0x80 + v1 % 0x40));
292 if (REALM_LIKELY(v1 < 0xD800 || 0xE000 <= v1)) {
293 if (REALM_UNLIKELY(out_end - out < 3)) {
294 break; // Not enough output buffer space
296 // UTF-8 layout: 1110xxxx 10xxxxxx 10xxxxxx
297 REALM_ASSERT(out >= out_begin && out + 2 < out_end);
298 *out++ = traits8::to_char_type(traits8_int_type(0xE0 + v1 / 0x1000));
299 *out++ = traits8::to_char_type(traits8_int_type(0x80 + v1 / 0x40 % 0x40));
300 *out++ = traits8::to_char_type(traits8_int_type(0x80 + v1 % 0x40));
306 if (REALM_UNLIKELY(out_end - out < 4)) {
307 break; // Not enough output buffer space
309 if (REALM_UNLIKELY(0xDC00 <= v1)) {
311 break; // Invalid first half of surrogate pair
313 if (REALM_UNLIKELY(in + 1 == in_end)) {
315 break; // Incomplete surrogate pair
317 REALM_ASSERT(&in[1] >= in_begin && &in[1] < in_end);
318 uint_fast16_t v2 = uint_fast16_t(Traits16::to_int_type(in[1]));
319 if (REALM_UNLIKELY(v2 < 0xDC00 || 0xE000 <= v2)) {
321 break; // Invalid second half of surrogate pair
323 uint_fast32_t v = 0x10000l + (uint_fast32_t(v1 - 0xD800) * 0x400 + (v2 - 0xDC00));
324 // UTF-8 layout: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
325 REALM_ASSERT(out >= out_begin && out + 3 < out_end);
326 *out++ = traits8::to_char_type(traits8_int_type(0xF0 + v / 0x40000));
327 *out++ = traits8::to_char_type(traits8_int_type(0x80 + v / 0x1000 % 0x40));
328 *out++ = traits8::to_char_type(traits8_int_type(0x80 + v / 0x40 % 0x40));
329 *out++ = traits8::to_char_type(traits8_int_type(0x80 + v % 0x40));
333 REALM_ASSERT(in >= in_begin && in <= in_end);
334 REALM_ASSERT(out >= out_begin && out <= out_end);
341 template <class Char16, class Traits16>
342 inline size_t Utf8x16<Char16, Traits16>::find_utf8_buf_size(const Char16*& in_begin, const Char16* const in_end)
345 const Char16* in = in_begin;
346 while (in != in_end) {
347 REALM_ASSERT(&in[0] >= in_begin && &in[0] < in_end);
348 uint_fast16_t v = uint_fast16_t(Traits16::to_int_type(in[0]));
349 if (REALM_LIKELY(v < 0x80)) {
350 if (REALM_UNLIKELY(int_add_with_overflow_detect(num_out, 1)))
351 break; // Avoid overflow
354 else if (REALM_LIKELY(v < 0x800)) {
355 if (REALM_UNLIKELY(int_add_with_overflow_detect(num_out, 2)))
356 break; // Avoid overflow
359 else if (REALM_LIKELY(v < 0xD800 || 0xE000 <= v)) {
360 if (REALM_UNLIKELY(int_add_with_overflow_detect(num_out, 3)))
361 break; // Avoid overflow
365 if (REALM_UNLIKELY(in + 1 == in_end)) {
366 break; // Incomplete surrogate pair
368 if (REALM_UNLIKELY(int_add_with_overflow_detect(num_out, 4)))
369 break; // Avoid overflow
373 REALM_ASSERT(in >= in_begin && in <= in_end);
380 #endif // REALM_UTIL_UTF8_HPP