1 /*************************************************************************
3 * Copyright 2016 Realm Inc.
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 **************************************************************************/
19 #ifndef REALM_UNICODE_HPP
20 #define REALM_UNICODE_HPP
26 #include <realm/string_data.hpp>
27 #include <realm/util/features.h>
28 #include <realm/utilities.hpp>
// Identifies a string comparison method; this is the type of the 'method' argument of
// set_string_compare_method().
// NOTE(review): the embedded line numbers jump from 33 to 36 and from 37 to 40 here, so some enumerators and the
// closing brace appear to be missing from this copy - confirm against the upstream header.
33 enum string_compare_method_t {
36 STRING_COMPARE_CALLBACK,
37 STRING_COMPARE_CORE_SIMILAR
// Globals holding a comparison callback and a comparison method; they are defined elsewhere.
// NOTE(review): presumably these store the per-process setting installed by set_string_compare_method() - confirm
// in the implementation file. No synchronization is visible here, and set_string_compare_method() is documented
// as NOT thread safe.
40 extern StringCompareCallback string_compare_callback;
41 extern string_compare_method_t string_compare_method;
43 // Description for set_string_compare_method():
45 // Short summary: iOS language binding: call
46 // set_string_compare_method() for fast but slightly inaccurate sort in some countries, or
47 // set_string_compare_method(2, callbackptr) for slow but precise sort (see callbackptr below)
49 // Different countries ('locales') have different sorting order for strings and letters. Because there unfortunately
50 // doesn't exist any unified standardized way to compare strings in C++ on multiple platforms, we need this method.
52 // It determines how sorting a TableView by a String column must take place. The 'method' argument can be:
54 // 0: Fast core-only compare (no OS/framework calls). LIMITATIONS: Works only up to 'Latin Extended 2' (unicodes
55 // 0...591). Also, sorting order is according to 'en_US' so it may be slightly inaccurate for some countries.
56 // The 'callback' argument is ignored.
58 // Return value: Always 'true'
60 // 1: Native C++11 method if core is compiled as C++11. Gives precise sorting according
61 // to user's current locale. LIMITATIONS: Currently works only on Windows and on Linux with clang. Does NOT work on
62 // iOS (due to only 'C' locale being available in CoreFoundation, which puts 'Z' before 'a'). Unknown if it works on
63 // Windows Phone / Android. Furthermore it does NOT work on Linux with gcc 4.7 or 4.8 (lack of a C++11 feature that
64 // can convert utf8->wstring without calls to setlocale()).
66 // Return value: 'true' if supported, otherwise 'false' (if so, then previous setting, if any, is preserved).
68 // 2: Callback method. Language binding / C++ user must provide a utf-8 callback method of prototype:
69 // bool callback(const char* string1, const char* string2) where 'callback' must return bool(string1 < string2).
71 // Return value: Always 'true'
73 // Default is method = 0 if the function is never called
75 // NOT THREAD SAFE! Call once during initialization or make sure it's not called simultaneously with different
76 // arguments. The setting is remembered per-process; it does NOT need to be called prior to each sort
77 bool set_string_compare_method(string_compare_method_t method, StringCompareCallback callback);
80 // Returns the size in bytes of a UTF-8 character. No error checking is performed.
// NOTE(review): 'lead' is presumably the first (lead) byte of the character's encoding - confirm.
81 size_t sequence_length(char lead);
83 // Limitations for case insensitive string search
84 // Case insensitive search (equal, begins_with, ends_with, like and contains)
85 // only works for unicodes 0...0x7f which is the same as the 0...127
86 // ASCII character set (letters a-z and A-Z).
88 // It does *not* work for the 0...255 ANSI character set that contains
89 // characters from many European countries like Germany, France, Denmark,
92 // It also does not work for characters from non-western countries like
93 // Japan, Russia, Arabia, etc.
95 // If there exist characters outside the ASCII range either in the text
96 // to be searched for, or in the Realm string column which is searched
97 // in, then the compare yields a random result such that the row may or
98 // may not be included in the result set.
100 // Returns bool(string1 < string2), i.e. true if string1 orders before string2.
// NOTE(review): presumably the ordering follows the method selected with set_string_compare_method() - confirm.
101 bool utf8_compare(StringData string1, StringData string2);
103 // Returns the Unicode value of a character.
// NOTE(review): 'character' presumably points at the first byte of a UTF-8 encoded character; no length is
// passed, so confirm how the end of the sequence is determined and whether any validation takes place.
104 uint32_t utf8value(const char* character);
// NOTE(review): undocumented. 'begin' is passed by reference, so the callee is able to modify the caller's
// pointer; presumably it compares the sequence starting at 'begin' (bounded by 'end') with the one starting at
// 'begin2' - confirm. It is declared 'inline' but no definition is visible in this part of the file; verify that
// a definition is available to every translation unit that calls it.
106 inline bool equal_sequence(const char*& begin, const char* end, const char* begin2);
108 // FIXME: The current approach to case insensitive comparison requires
109 // that case mappings can be done in a way that does not change the
110 // number of bytes used to encode the individual Unicode
111 // character. This is not generally the case, so, as far as I can see,
112 // this approach has no future.
114 // FIXME: The current approach to case insensitive comparison relies
115 // on checking each "haystack" character against the corresponding
116 // character in both a lower cased and an upper cased version of the
117 // "needle". While this leads to efficient comparison, it ignores the
118 // fact that "case folding" is the only correct approach to case
119 // insensitive comparison in a locale agnostic Unicode
123 // http://www.w3.org/International/wiki/Case_folding
124 // http://userguide.icu-project.org/transforms/casemappings#TOC-Case-Folding.
126 // The ideal API would probably be something like this:
128 // case_fold: utf_8 -> case_folded
129 // equal_case_fold: (needle_case_folded, single_haystack_entry_utf_8) -> found
130 // search_case_fold: (needle_case_folded, huge_haystack_string_utf_8) -> found_at_position
132 // The case folded form would probably be using UTF-32 or UTF-16.
/// Applies a case mapping to \a source.
135 /// If successful, returns a string of the same size as \a source.
136 /// Returns none if invalid UTF-8 encoding was encountered.
/// NOTE(review): \a upper presumably selects upper case (true) versus lower case (false) mapping - confirm.
137 util::Optional<std::string> case_map(StringData source, bool upper);
/// Tag type whose single value, IgnoreErrors, selects the three-argument case_map() overload.
139 enum IgnoreErrorsTag { IgnoreErrors };
/// Overload of case_map() that returns a plain std::string rather than an optional.
/// NOTE(review): presumably invalid UTF-8 is tolerated instead of being reported, as the tag name suggests -
/// confirm what the returned string contains in that case.
140 std::string case_map(StringData source, bool upper, IgnoreErrorsTag);
/// Case insensitive equality check of \a haystack against a needle that is supplied in both an upper cased
/// (\a needle_upper) and a lower cased (\a needle_lower) version.
142 /// Assumes that the sizes of \a needle_upper and \a needle_lower are
143 /// identical to the size of \a haystack. Returns false if the needle
144 /// is different from the haystack.
145 bool equal_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower);
/// Case insensitive search in \a haystack for a needle that is supplied in both an upper cased
/// (\a needle_upper) and a lower cased (\a needle_lower) version.
147 /// Assumes that the sizes of \a needle_upper and \a needle_lower are
148 /// both equal to \a needle_size. Returns haystack.size() if the
149 /// needle was not found.
/// NOTE(review): on success the return value is presumably the position of the match in \a haystack - confirm.
150 size_t search_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size);
/// Case insensitive containment test of a needle, supplied in both an upper cased (\a needle_upper) and a
/// lower cased (\a needle_lower) version, within \a haystack.
152 /// Assumes that the sizes of \a needle_upper and \a needle_lower are
153 /// both equal to \a needle_size. Returns false if the
154 /// needle was not found.
/// NOTE(review): the role of \a charmap (a table of 256 bytes) is not documented here - confirm its expected
/// contents against the implementation before calling.
155 bool contains_ins(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size, const std::array<uint8_t, 256> &charmap);
157 /// Case insensitive wildcard matching ('?' for single char, '*' for zero or more chars)
/// of \a text against \a pattern. Declared noexcept.
158 bool string_like_ins(StringData text, StringData pattern) noexcept;
/// NOTE(review): presumably the same matching, with the pattern supplied as an upper cased (\a upper) and a
/// lower cased (\a lower) version - confirm.
159 bool string_like_ins(StringData text, StringData upper, StringData lower) noexcept;
163 #endif // REALM_UNICODE_HPP