iOS/Pods/Realm/include/core/realm/unicode.hpp

   1 /*************************************************************************
   2  *
   3  * Copyright 2016 Realm Inc.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  * http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  *
  17  **************************************************************************/
  18
  19 #ifndef REALM_UNICODE_HPP
  20 #define REALM_UNICODE_HPP
  21
  22 #include <locale>
  23 #include <cstdint>
  24 #include <string>
  25
  26 #include <realm/string_data.hpp>
  27 #include <realm/util/features.h>
  28 #include <realm/utilities.hpp>
  29
  30
  31 namespace realm {
  32
  33 enum string_compare_method_t {
  34     STRING_COMPARE_CORE,
  35     STRING_COMPARE_CPP11,
  36     STRING_COMPARE_CALLBACK,
  37     STRING_COMPARE_CORE_SIMILAR
  38 };
  39
  40 extern StringCompareCallback string_compare_callback;
  41 extern string_compare_method_t string_compare_method;
  42
  43 // Description for set_string_compare_method():
  44 //
  45 // Short summary: iOS language binding: call
  46 //     set_string_compare_method() for fast but slightly inaccurate sort in some countries, or
  47 //     set_string_compare_method(2, callbackptr) for slow but precise sort (see callbackptr below)
  48 //
  49 // Different countries ('locales') have different sorting order for strings and letters. Because there unfortunatly
  50 // doesn't exist any unified standardized way to compare strings in C++ on multiple platforms, we need this method.
  51 //
  52 // It determins how sorting a TableView by a String column must take place. The 'method' argument can be:
  53 //
  54 // 0: Fast core-only compare (no OS/framework calls). LIMITATIONS: Works only upto 'Latin Extended 2' (unicodes
  55 // 0...591). Also, sorting order is according to 'en_US' so it may be slightly inaccurate for some countries.
  56 // 'callback' argument is ignored.
  57 //
  58 // Return value: Always 'true'
  59 //
  60 // 1: Native C++11 method if core is compiled as C++11. Gives precise sorting according
  61 // to user's current locale. LIMITATIONS: Currently works only on Windows and on Linux with clang. Does NOT work on
  62 // iOS (due to only 'C' locale being available in CoreFoundation, which puts 'Z' before 'a'). Unknown if works on
  63 // Windows Phone / Android. Furthermore it does NOT work on Linux with gcc 4.7 or 4.8 (lack of c++11 feature that
  64 // can convert utf8->wstring without calls to setlocale()).
  65 //
  66 // Return value: 'true' if supported, otherwise 'false' (if so, then previous setting, if any, is preserved).
  67 //
  68 // 2: Callback method. Language binding / C++ user must provide a utf-8 callback method of prototype:
  69 // bool callback(const char* string1, const char* string2) where 'callback' must return bool(string1 < string2).
  70 //
  71 // Return value: Always 'true'
  72 //
  73 // Default is method = 0 if the function is never called
  74 //
  75 // NOT THREAD SAFE! Call once during initialization or make sure it's not called simultaneously with different
  76 // arguments. The setting is remembered per-process; it does NOT need to be called prior to each sort
  77 bool set_string_compare_method(string_compare_method_t method, StringCompareCallback callback);
  78
  79
  80 // Return size in bytes of utf8 character. No error checking
  81 size_t sequence_length(char lead);
  82
  83 // Limitations for case insensitive string search
  84 // Case insensitive search (equal, begins_with, ends_with, like and contains)
  85 // only works for unicodes 0...0x7f which is the same as the 0...127
  86 // ASCII character set (letters a-z and A-Z).
  87
  88 // In does *not* work for the 0...255 ANSI character set that contains
  89 // characters from many European countries like Germany, France, Denmark,
  90 // etc.
  91
  92 // It also does not work for characters from non-western countries like
  93 // Japan, Russia, Arabia, etc.
  94
  95 // If there exists characters outside the ASCII range either in the text
  96 // to be searched for, or in the Realm string column which is searched
  97 // in, then the compare yields a random result such that the row may or
  98 // may not be included in the result set.
  99
 100 // Return bool(string1 < string2)
 101 bool utf8_compare(StringData string1, StringData string2);
 102
 103 // Return unicode value of character.
 104 uint32_t utf8value(const char* character);
 105
 106 inline bool equal_sequence(const char*& begin, const char* end, const char* begin2);
 107
 108 // FIXME: The current approach to case insensitive comparison requires
 109 // that case mappings can be done in a way that does not change he
 110 // number of bytes used to encode the individual Unicode
 111 // character. This is not generally the case, so, as far as I can see,
 112 // this approach has no future.
 113 //
 114 // FIXME: The current approach to case insensitive comparison relies
 115 // on checking each "haystack" character against the corresponding
 116 // character in both a lower cased and an upper cased version of the
 117 // "needle". While this leads to efficient comparison, it ignores the
 118 // fact that "case folding" is the only correct approach to case
 119 // insensitive comparison in a locale agnostic Unicode
 120 // environment.
 121 //
 122 // See
 123 //   http://www.w3.org/International/wiki/Case_folding
 124 //   http://userguide.icu-project.org/transforms/casemappings#TOC-Case-Folding.
 125 //
 126 // The ideal API would probably be something like this:
 127 //
 128 //   case_fold:        utf_8 -> case_folded
 129 //   equal_case_fold:  (needle_case_folded, single_haystack_entry_utf_8) -> found
 130 //   search_case_fold: (needle_case_folded, huge_haystack_string_utf_8) -> found_at_position
 131 //
 132 // The case folded form would probably be using UTF-32 or UTF-16.
 133
 134
 135 /// If successful, returns a string of the same size as \a source.
 136 /// Returns none if invalid UTF-8 encoding was encountered.
 137 util::Optional<std::string> case_map(StringData source, bool upper);
 138
 139 enum IgnoreErrorsTag { IgnoreErrors };
 140 std::string case_map(StringData source, bool upper, IgnoreErrorsTag);
 141
 142 /// Assumes that the sizes of \a needle_upper and \a needle_lower are
 143 /// identical to the size of \a haystack. Returns false if the needle
 144 /// is different from the haystack.
 145 bool equal_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower);
 146
 147 /// Assumes that the sizes of \a needle_upper and \a needle_lower are
 148 /// both equal to \a needle_size. Returns haystack.size() if the
 149 /// needle was not found.
 150 size_t search_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size);
 151
 152 /// Assumes that the sizes of \a needle_upper and \a needle_lower are
 153 /// both equal to \a needle_size. Returns false if the
 154 /// needle was not found.
 155 bool contains_ins(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size, const std::array<uint8_t, 256> &charmap);
 156
 157 /// Case insensitive wildcard matching ('?' for single char, '*' for zero or more chars)
 158 bool string_like_ins(StringData text, StringData pattern) noexcept;
 159 bool string_like_ins(StringData text, StringData upper, StringData lower) noexcept;
 160
 161 } // namespace realm
 162
 163 #endif // REALM_UNICODE_HPP