1 /*************************************************************************
3 * Copyright 2016 Realm Inc.
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 **************************************************************************/
19 #ifndef REALM_INDEX_STRING_HPP
20 #define REALM_INDEX_STRING_HPP
26 #include <realm/array.hpp>
27 #include <realm/column_fwd.hpp>
30 The StringIndex class is used for both type_String and all integral types, such as type_Bool, type_OldDateTime and
31 type_Int. When used for integral types, the 64-bit integer is simply casted to a string of 8 bytes through a
32 pretty simple "wrapper layer" in all public methods.
34 The StringIndex data structure is like an "inversed" B+ tree where the leafs contain row indexes and the non-leafs
35 contain 4-byte chunks of payload. Imagine a table with following strings:
37 hello, kitty, kitten, foobar, kitty, foobar
39 The topmost level of the index tree contains prefixes of the payload strings of length <= 4. The next level contains
40 prefixes of the remaining parts of the strings. Unnecessary levels of the tree are optimized away; the prefix "foob"
41 is shared only by rows that are identical ("foobar"), so "ar" is not needed to be stored in the tree.
49 Each non-leafs consists of two integer arrays of the same length, one containing payload and the other containing
50 references to the sublevel nodes.
52 The leafs can be either a single value or a Column. If the reference in its parent node has its least significant
53 bit set, then the remaining upper bits specify the row index at which the string is stored. If the bit is clear,
54 it must be interpreted as a reference to a Column that stores the row indexes at which the string is stored.
56 If a Column is used, then all row indexes are guaranteed to be sorted increasingly, which means you an search in it
57 using our binary search functions such as upper_bound() and lower_bound(). Each duplicate value will be stored in
58 the same Column, but Columns may contain more than just duplicates if the depth of the tree exceeds the value
59 `s_max_offset` This is to avoid stack overflow problems with many of our recursive functions if we have two very
60 long strings that have a long common prefix but differ in the last couple bytes. If a Column stores more than just
61 duplicates, then the list is kept sorted in ascending order by string value and within the groups of common
62 strings, the rows are sorted in ascending order.
70 class IndexArray : public Array {
72 IndexArray(Allocator& allocator)
77 size_t index_string_find_first(StringData value, ColumnBase* column) const;
78 void index_string_find_all(IntegerColumn& result, StringData value, ColumnBase* column, bool case_insensitive = false) const;
79 FindRes index_string_find_all_no_copy(StringData value, ColumnBase* column, InternalFindResult& result) const;
80 size_t index_string_count(StringData value, ColumnBase* column) const;
83 template <IndexMethod>
84 size_t from_list(StringData value, InternalFindResult& result_ref, const IntegerColumn& rows,
85 ColumnBase* column) const;
87 void from_list_all(StringData value, IntegerColumn& result, const IntegerColumn& rows, ColumnBase* column) const;
89 void from_list_all_ins(StringData value, std::vector<size_t>& result, const IntegerColumn& rows,
90 ColumnBase* column) const;
92 template <IndexMethod method>
93 size_t index_string(StringData value, InternalFindResult& result_ref, ColumnBase* column) const;
95 void index_string_all(StringData value, IntegerColumn& result, ColumnBase* column) const;
97 void index_string_all_ins(StringData value, IntegerColumn& result, ColumnBase* column) const;
103 StringIndex(ColumnBase* target_column, Allocator&);
104 StringIndex(ref_type, ArrayParent*, size_t ndx_in_parent, ColumnBase* target_column, Allocator&);
105 ~StringIndex() noexcept
109 static ref_type create_empty(Allocator& alloc);
111 void set_target(ColumnBase* target_column) noexcept;
114 Allocator& get_alloc() const noexcept;
115 void destroy() noexcept;
117 bool is_attached() const noexcept;
118 void set_parent(ArrayParent* parent, size_t ndx_in_parent) noexcept;
119 size_t get_ndx_in_parent() const noexcept;
120 void set_ndx_in_parent(size_t ndx_in_parent) noexcept;
121 void update_from_parent(size_t old_baseline) noexcept;
122 void refresh_accessor_tree(size_t, const Spec&);
123 ref_type get_ref() const noexcept;
125 // StringIndex interface:
127 // 12 is the biggest element size of any non-string/binary Realm type
128 static const size_t string_conversion_buffer_size = 12;
129 using StringConversionBuffer = std::array<char, string_conversion_buffer_size>;
131 bool is_empty() const;
134 void insert(size_t row_ndx, T value, size_t num_rows, bool is_append);
136 void insert(size_t row_ndx, util::Optional<T> value, size_t num_rows, bool is_append);
139 void set(size_t row_ndx, T new_value);
141 void set(size_t row_ndx, util::Optional<T> new_value);
144 void erase(size_t row_ndx, bool is_last);
147 size_t find_first(T value) const;
149 void find_all(IntegerColumn& result, T value, bool case_insensitive = false) const;
151 FindRes find_all_no_copy(T value, InternalFindResult& result) const;
153 size_t count(T value) const;
155 void update_ref(T value, size_t old_row_ndx, size_t new_row_ndx);
159 void distinct(IntegerColumn& result) const;
160 bool has_duplicate_values() const noexcept;
164 template <typename T>
165 void verify_entries(const T& column) const;
166 void do_dump_node_structure(std::ostream&, int) const;
168 void to_dot(std::ostream&, StringData title = StringData()) const;
169 void to_dot_2(std::ostream&, StringData title = StringData()) const;
172 typedef int32_t key_type;
174 // s_max_offset specifies the number of levels of recursive string indexes
175 // allowed before storing everything in lists. This is to avoid nesting
176 // to too deep of a level. Since every SubStringIndex stores 4 bytes, this
177 // means that a StringIndex is helpful for strings of a common prefix up to
178 // 4 times this limit (200 bytes shared). Lists are stored in sorted order,
179 // so strings sharing a common prefix of more than this limit will use a
180 // binary search of approximate complexity log2(n) from `std::lower_bound`.
181 static const size_t s_max_offset = 200; // max depth * s_index_key_length
182 static const size_t s_index_key_length = 4;
183 static key_type create_key(StringData) noexcept;
184 static key_type create_key(StringData, size_t) noexcept;
187 // m_array is a compact representation for storing the children of this StringIndex.
190 // 2) a reference to a list which stores row numbers (for duplicate strings).
191 // 3) a reference to a sub-index
192 // m_array[0] is always a reference to a values array which stores the 4 byte chunk
193 // of payload data for quick string chunk comparisons. The array stored
194 // at m_array[0] lines up with the indices of values in m_array[1] so for example
195 // starting with an empty StringIndex:
196 // StringColumn::insert(target_row_ndx=42, value="test_string") would result with
197 // get_array_from_ref(m_array[0])[0] == create_key("test") and
199 // In this way, m_array which stores one child has a size of two.
200 // Children are type 1 (row number) if the LSB of the value is set.
201 // To get the actual row value, shift value down by one.
202 // If the LSB of the value is 0 then the value is a reference and can be either
203 // type 2, or type 3 (no shifting in either case).
204 // References point to a list if the context header flag is NOT set.
205 // If the header flag is set, references point to a sub-StringIndex (nesting).
206 std::unique_ptr<IndexArray> m_array;
207 ColumnBase* m_target_column;
209 struct inner_node_tag {
211 StringIndex(inner_node_tag, Allocator&);
213 static IndexArray* create_node(Allocator&, bool is_leaf);
215 void insert_with_offset(size_t row_ndx, StringData value, size_t offset);
216 void insert_row_list(size_t ref, size_t offset, StringData value);
217 void insert_to_existing_list(size_t row, StringData value, IntegerColumn& list);
218 void insert_to_existing_list_at_lower(size_t row, StringData value, IntegerColumn& list,
219 const IntegerColumnIterator& lower);
220 key_type get_last_key() const;
222 /// Add small signed \a diff to all elements that are greater than, or equal
223 /// to \a min_row_ndx.
224 void adjust_row_indexes(size_t min_row_ndx, int diff);
229 enum ChangeType { none, insert_before, insert_after, split } type;
230 NodeChange(ChangeType t, size_t r1 = 0, size_t r2 = 0)
245 void TreeInsert(size_t row_ndx, key_type, size_t offset, StringData value);
246 NodeChange do_insert(size_t ndx, key_type, size_t offset, StringData value);
247 /// Returns true if there is room or it can join existing entries
248 bool leaf_insert(size_t row_ndx, key_type, size_t offset, StringData value, bool noextend = false);
249 void node_insert_split(size_t ndx, size_t new_ref);
250 void node_insert(size_t ndx, size_t ref);
251 void do_delete(size_t ndx, StringData, size_t offset);
252 void do_update_ref(StringData value, size_t row_ndx, size_t new_row_ndx, size_t offset);
254 StringData get(size_t ndx, StringConversionBuffer& buffer) const;
256 void node_add_key(ref_type ref);
259 static void dump_node_structure(const Array& node, std::ostream&, int level);
260 static void array_to_dot(std::ostream&, const Array&);
261 static void keys_to_dot(std::ostream&, const Array&, StringData title = StringData());
266 class SortedListComparator {
268 SortedListComparator(ColumnBase& column_values);
269 bool operator()(int64_t ndx, StringData needle);
270 bool operator()(StringData needle, int64_t ndx);
283 struct GetIndexData<int64_t> {
284 static StringData get_index_data(const int64_t& value, StringIndex::StringConversionBuffer& buffer)
286 const char* c = reinterpret_cast<const char*>(&value);
287 realm::safe_copy_n(c, sizeof(int64_t), buffer.data());
288 return StringData{buffer.data(), sizeof(int64_t)};
293 struct GetIndexData<StringData> {
294 static StringData get_index_data(StringData data, StringIndex::StringConversionBuffer&)
301 struct GetIndexData<null> {
302 static StringData get_index_data(null, StringIndex::StringConversionBuffer&)
309 struct GetIndexData<Timestamp> {
310 static StringData get_index_data(const Timestamp&, StringIndex::StringConversionBuffer&);
314 struct GetIndexData<util::Optional<T>> {
315 static StringData get_index_data(const util::Optional<T>& value, StringIndex::StringConversionBuffer& buffer)
318 return GetIndexData<T>::get_index_data(*value, buffer);
324 struct GetIndexData<float> {
325 static StringData get_index_data(float, StringIndex::StringConversionBuffer&)
327 REALM_ASSERT_RELEASE(false); // LCOV_EXCL_LINE; Index on float not supported
332 struct GetIndexData<double> {
333 static StringData get_index_data(double, StringIndex::StringConversionBuffer&)
335 REALM_ASSERT_RELEASE(false); // LCOV_EXCL_LINE; Index on float not supported
340 struct GetIndexData<const char*> : GetIndexData<StringData> {
343 // to_str() is used by the integer index. The existing StringIndex is re-used for this
344 // by making IntegerColumn convert its integers to strings by calling to_str().
347 inline StringData to_str(T&& value, StringIndex::StringConversionBuffer& buffer)
349 return GetIndexData<typename std::remove_reference<T>::type>::get_index_data(value, buffer);
353 inline StringIndex::StringIndex(ColumnBase* target_column, Allocator& alloc)
354 : m_array(create_node(alloc, true)) // Throws
355 , m_target_column(target_column)
359 inline StringIndex::StringIndex(ref_type ref, ArrayParent* parent, size_t ndx_in_parent, ColumnBase* target_column,
361 : m_array(new IndexArray(alloc))
362 , m_target_column(target_column)
364 REALM_ASSERT_EX(Array::get_context_flag_from_header(alloc.translate(ref)), ref, size_t(alloc.translate(ref)));
365 m_array->init_from_ref(ref);
366 set_parent(parent, ndx_in_parent);
369 inline StringIndex::StringIndex(inner_node_tag, Allocator& alloc)
370 : m_array(create_node(alloc, false)) // Throws
371 , m_target_column(nullptr)
375 // Byte order of the key is *reversed*, so that for the integer index, the least significant
376 // byte comes first, so that it fits little-endian machines. That way we can perform fast
377 // range-lookups and iterate in order, etc, as future features. This, however, makes the same
378 // features slower for string indexes. Todo, we should reverse the order conditionally, depending
379 // on the column type.
380 inline StringIndex::key_type StringIndex::create_key(StringData str) noexcept
386 if (str.size() < 2) {
395 // Create 4 byte index key
396 // (encoded like this to allow literal comparisons
397 // independently of endianness)
399 key |= (key_type(static_cast<unsigned char>(str[3])) << 0);
401 key |= (key_type(static_cast<unsigned char>(str[2])) << 8);
403 key |= (key_type(static_cast<unsigned char>(str[1])) << 16);
405 key |= (key_type(static_cast<unsigned char>(str[0])) << 24);
410 // Index works as follows: All non-NULL values are stored as if they had appended an 'X' character at the end. So
411 // "foo" is stored as if it was "fooX", and "" (empty string) is stored as "X". And NULLs are stored as empty strings.
412 inline StringIndex::key_type StringIndex::create_key(StringData str, size_t offset) noexcept
417 if (offset > str.size())
420 // for very short strings
421 size_t tail = str.size() - offset;
422 if (tail <= sizeof(key_type) - 1) {
423 char buf[sizeof(key_type)];
424 memset(buf, 0, sizeof(key_type));
426 memcpy(buf, str.data() + offset, tail);
427 return create_key(StringData(buf, tail + 1));
430 return create_key(str.substr(offset));
434 void StringIndex::insert(size_t row_ndx, T value, size_t num_rows, bool is_append)
436 REALM_ASSERT_3(row_ndx, !=, npos);
438 // If the new row is inserted after the last row in the table, we don't need
439 // to adjust any row indexes.
441 for (size_t i = 0; i < num_rows; ++i) {
442 size_t row_ndx_2 = row_ndx + i;
443 adjust_row_indexes(row_ndx_2, 1); // Throws
447 StringConversionBuffer buffer;
449 for (size_t i = 0; i < num_rows; ++i) {
450 size_t row_ndx_2 = row_ndx + i;
451 size_t offset = 0; // First key from beginning of string
452 insert_with_offset(row_ndx_2, to_str(value, buffer), offset); // Throws
457 void StringIndex::insert(size_t row_ndx, util::Optional<T> value, size_t num_rows, bool is_append)
460 insert(row_ndx, *value, num_rows, is_append);
463 insert(row_ndx, null{}, num_rows, is_append);
468 void StringIndex::set(size_t row_ndx, T new_value)
470 StringConversionBuffer buffer;
471 StringConversionBuffer buffer2;
472 StringData old_value = get(row_ndx, buffer);
473 StringData new_value2 = to_str(new_value, buffer2);
475 // Note that insert_with_offset() throws UniqueConstraintViolation.
477 if (REALM_LIKELY(new_value2 != old_value)) {
478 // We must erase this row first because erase uses find_first which
479 // might find the duplicate if we insert before erasing.
480 bool is_last = true; // To avoid updating refs
481 erase<T>(row_ndx, is_last); // Throws
483 size_t offset = 0; // First key from beginning of string
484 insert_with_offset(row_ndx, new_value2, offset); // Throws
489 void StringIndex::set(size_t row_ndx, util::Optional<T> new_value)
492 set(row_ndx, *new_value);
495 set(row_ndx, null{});
500 void StringIndex::erase(size_t row_ndx, bool is_last)
502 StringConversionBuffer buffer;
503 StringData value = get(row_ndx, buffer);
505 do_delete(row_ndx, value, 0);
507 // Collapse top nodes with single item
508 while (m_array->is_inner_bptree_node()) {
509 REALM_ASSERT(m_array->size() > 1); // node cannot be empty
510 if (m_array->size() > 2)
513 ref_type ref = m_array->get_as_ref(1);
514 m_array->set(1, 1); // avoid destruction of the extracted ref
515 m_array->destroy_deep();
516 m_array->init_from_ref(ref);
517 m_array->update_parent();
520 // If it is last item in column, we don't have to update refs
522 adjust_row_indexes(row_ndx, -1);
526 size_t StringIndex::find_first(T value) const
528 // Use direct access method
529 StringConversionBuffer buffer;
530 return m_array->index_string_find_first(to_str(value, buffer), m_target_column);
534 void StringIndex::find_all(IntegerColumn& result, T value, bool case_insensitive) const
536 // Use direct access method
537 StringConversionBuffer buffer;
538 return m_array->index_string_find_all(result, to_str(value, buffer), m_target_column, case_insensitive);
542 FindRes StringIndex::find_all_no_copy(T value, InternalFindResult& result) const
544 // Use direct access method
545 StringConversionBuffer buffer;
546 return m_array->index_string_find_all_no_copy(to_str(value, buffer), m_target_column, result);
550 size_t StringIndex::count(T value) const
552 // Use direct access method
553 StringConversionBuffer buffer;
554 return m_array->index_string_count(to_str(value, buffer), m_target_column);
558 void StringIndex::update_ref(T value, size_t old_row_ndx, size_t new_row_ndx)
560 StringConversionBuffer buffer;
561 do_update_ref(to_str(value, buffer), old_row_ndx, new_row_ndx, 0);
564 inline void StringIndex::destroy() noexcept
566 return m_array->destroy_deep();
569 inline bool StringIndex::is_attached() const noexcept
571 return m_array->is_attached();
574 inline void StringIndex::refresh_accessor_tree(size_t, const Spec&)
576 m_array->init_from_parent();
579 inline ref_type StringIndex::get_ref() const noexcept
581 return m_array->get_ref();
584 inline void StringIndex::set_parent(ArrayParent* parent, size_t ndx_in_parent) noexcept
586 m_array->set_parent(parent, ndx_in_parent);
589 inline size_t StringIndex::get_ndx_in_parent() const noexcept
591 return m_array->get_ndx_in_parent();
594 inline void StringIndex::set_ndx_in_parent(size_t ndx_in_parent) noexcept
596 m_array->set_ndx_in_parent(ndx_in_parent);
599 inline void StringIndex::update_from_parent(size_t old_baseline) noexcept
601 m_array->update_from_parent(old_baseline);
606 #endif // REALM_INDEX_STRING_HPP