1 /*************************************************************************
3 * Copyright 2016 Realm Inc.
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 **************************************************************************/
19 #ifndef REALM_ALLOC_SLAB_HPP
20 #define REALM_ALLOC_SLAB_HPP
22 #include <cstdint> // uint8_t etc
27 #include <realm/util/features.h>
28 #include <realm/util/file.hpp>
29 #include <realm/alloc.hpp>
30 #include <realm/disable_sync_to_disk.hpp>
39 /// Thrown by Group and SharedGroup constructors if the specified file
40 /// (or memory buffer) does not appear to contain a valid Realm
42 struct InvalidDatabase;
45 /// The allocator that is used to manage the memory of a Realm
46 /// group, i.e., a Realm database.
48 /// Optionally, it can be attached to a pre-existing database (file
49 /// or memory buffer) which then becomes an immutable part of the
52 /// To attach a slab allocator to a pre-existing database, call
53 /// attach_file() or attach_buffer(). To create a new database
54 /// in-memory, call attach_empty().
56 /// For efficiency, this allocator manages its mutable memory as a set
58 class SlabAlloc : public Allocator {
60 ~SlabAlloc() noexcept override;
63 // Disable copying. Copying an allocator can produce double frees.
64 SlabAlloc(const SlabAlloc&) = delete;
65 SlabAlloc& operator=(const SlabAlloc&) = delete;
68 /// \brief Storage for combining setup flags for initialization to
71 /// \var Config::is_shared
72 /// Must be true if, and only if we are called on behalf of SharedGroup.
74 /// \var Config::read_only
75 /// Open the file in read-only mode. This implies \a Config::no_create.
77 /// \var Config::no_create
78 /// Fail if the file does not already exist.
80 /// \var Config::skip_validate
81 /// Skip validation of file header. In a
82 /// set of overlapping SharedGroups, only the first one (the one
83 /// that creates/initializes the coordination file) may validate
84 /// the header, otherwise it will result in a race condition.
86 /// \var Config::encryption_key
87 /// 32-byte key to use to encrypt and decrypt the backing storage,
88 /// or nullptr to disable encryption.
90 /// \var Config::session_initiator
91 /// If set, the caller is the session initiator and
92 /// guarantees exclusive access to the file. If attaching in
93 /// read/write mode, the file is modified: files on streaming form
94 /// is changed to non-streaming form, and if needed the file size
95 /// is adjusted to match mmap boundaries.
96 /// Must be set to false if is_shared is false.
98 /// \var Config::clear_file
99 /// Always initialize the file as if it was a newly
100 /// created file and ignore any pre-existing contents. Requires that
101 /// Config::session_initiator be true as well.
103 bool is_shared = false;
104 bool read_only = false;
105 bool no_create = false;
106 bool skip_validate = false;
107 bool session_initiator = false;
108 bool clear_file = false;
109 const char* encryption_key = nullptr;
115 /// \brief Attach this allocator to the specified file.
117 /// It is an error if this function is called at a time where the specified
118 /// Realm file (file system inode) is modified asynchronously.
120 /// In non-shared mode (when this function is called on behalf of a
121 /// free-standing Group instance), it is the responsibility of the
122 /// application to ensure that the Realm file is not modified concurrently
123 /// from any other thread or process.
125 /// In shared mode (when this function is called on behalf of a SharedGroup
126 /// instance), the caller (SharedGroup::do_open()) must take steps to ensure
127 /// cross-process mutual exclusion.
129 /// Except for \a file_path, the parameters are passed in through a
130 /// configuration object.
132 /// \return The `ref` of the root node, or zero if there is none.
134 /// Please note that attach_file can fail to attach to a file due to a
135 /// collision with a writer extending the file. This can only happen if the
136 /// caller is *not* the session initiator. When this happens, attach_file()
137 /// throws SlabAlloc::Retry, and the caller must retry the call. The caller
138 /// should check if it has become the session initiator before retrying.
139 /// This can happen if the conflicting thread (or process) terminates or
140 /// crashes before the next retry.
142 /// \throw util::File::AccessError
143 /// \throw SlabAlloc::Retry
144 ref_type attach_file(const std::string& file_path, Config& cfg);
146 /// Get the attached file. Only valid when called on an allocator with
147 /// an attached file.
148 util::File& get_file();
150 /// Attach this allocator to the specified memory buffer.
152 /// It is an error to call this function on an attached
153 /// allocator. Doing so will result in undefined behavior.
155 /// \return The `ref` of the root node, or zero if there is none.
159 /// \throw InvalidDatabase
160 ref_type attach_buffer(const char* data, size_t size);
162 /// Reads file format from file header. Must be called from within a write
164 int get_committed_file_format_version() const noexcept;
166 /// Attach this allocator to an empty buffer.
168 /// It is an error to call this function on an attached
169 /// allocator. Doing so will result in undefined behavior.
172 /// Detach from a previously attached file or buffer.
174 /// This function does not reset free space tracking. To
175 /// completely reset the allocator, you must also call
176 /// reset_free_space_tracking().
178 /// This function has no effect if the allocator is already in the
179 /// detached state (idempotency).
180 void detach() noexcept;
184 /// If a memory buffer has been attached using attach_buffer(),
185 /// mark it as owned by this slab allocator. Behaviour is
186 /// undefined if this function is called on a detached allocator,
187 /// one that is not attached using attach_buffer(), or one for
188 /// which this function has already been called during the latest
190 void own_buffer() noexcept;
192 /// Returns true if, and only if this allocator is currently
193 /// in the attached state.
194 bool is_attached() const noexcept;
196 /// Returns true if, and only if this allocator is currently in
197 /// the attached state and attachment was not established using
199 bool nonempty_attachment() const noexcept;
201 /// Reserve disk space now to avoid allocation errors at a later
202 /// point in time, and to minimize on-disk fragmentation. In some
203 /// cases, less fragmentation translates into improved
204 /// performance. On flash or SSD-drives this is likely a waste.
206 /// Note: File::prealloc() may misbehave under race conditions (see
207 /// documentation of File::prealloc()). For that reason, to avoid race
208 /// conditions, when this allocator is used in a transactional mode, this
209 /// function may be called only when the caller has exclusive write
210 /// access. In non-transactional mode it is the responsibility of the user
211 /// to ensure non-concurrent file mutation.
213 /// This function will call File::sync().
215 /// It is an error to call this function on an allocator that is not
216 /// attached to a file. Doing so will result in undefined behavior.
217 void resize_file(size_t new_file_size);
219 /// Reserve disk space now to avoid allocation errors at a later point in
220 /// time, and to minimize on-disk fragmentation. In some cases, less
221 /// fragmentation translates into improved performance. On SSD-drives
222 /// preallocation is likely a waste.
224 /// When supported by the system, a call to this function will make the
225 /// database file at least as big as the specified size, and cause space on
226 /// the target device to be allocated (note that on many systems on-disk
227 /// allocation is done lazily by default). If the file is already bigger
228 /// than the specified size, the size will be unchanged, and on-disk
229 /// allocation will occur only for the initial section that corresponds to
230 /// the specified size. On systems that do not support preallocation, this
231 /// function has no effect. To know whether preallocation is supported by
232 /// Realm on your platform, call util::File::is_prealloc_supported().
234 /// This function will call File::sync() if it changes the size of the file.
236 /// It is an error to call this function on an allocator that is not
237 /// attached to a file. Doing so will result in undefined behavior.
238 void reserve_disk_space(size_t size_in_bytes);
240 /// Get the size of the attached database file or buffer in number
241 /// of bytes. This size is not affected by new allocations. After
242 /// attachment, it can only be modified by a call to update_reader_view().
244 /// It is an error to call this function on a detached allocator,
245 /// or one that was attached using attach_empty(). Doing so will
246 /// result in undefined behavior.
247 size_t get_baseline() const noexcept;
249 /// Get the total amount of managed memory. This is the baseline plus the
250 /// sum of the sizes of the allocated slabs. It includes any free space.
252 /// It is an error to call this function on a detached
253 /// allocator. Doing so will result in undefined behavior.
254 size_t get_total_size() const noexcept;
256 /// Mark all mutable memory (ref-space outside the attached file) as free
258 void reset_free_space_tracking();
260 /// Update the readers view of the file:
262 /// Remap the attached file such that a prefix of the specified
263 /// size becomes available in memory. If successful,
264 /// get_baseline() will return the specified new file size.
266 /// It is an error to call this function on a detached allocator,
267 /// or one that was not attached using attach_file(). Doing so
268 /// will result in undefined behavior.
270 /// The file_size argument must be aligned to a *section* boundary:
271 /// The database file is logically split into sections, each section
272 /// guaranteed to be mapped as a contiguous address range. The allocation
273 /// of memory in the file must ensure that no allocation crosses the
274 /// boundary between two sections.
276 /// Clears any allocator specific caching of address translations
277 /// and force any later address translations to trigger decryption if required.
278 void update_reader_view(size_t file_size);
280 /// Returns true initially, and after a call to reset_free_space_tracking()
281 /// up until the point of the first call to SlabAlloc::alloc(). Note that a
282 /// call to SlabAlloc::alloc() corresponds to a mutation event.
283 bool is_free_space_clean() const noexcept;
285 void verify() const override;
287 void enable_debug(bool enable)
289 m_debug_out = enable;
291 bool is_all_free() const;
297 MemRef do_alloc(const size_t size) override;
298 MemRef do_realloc(ref_type, const char*, size_t old_size, size_t new_size) override;
299 // FIXME: It would be very nice if we could detect an invalid free operation in debug mode
300 void do_free(ref_type, const char*) noexcept override;
301 char* do_translate(ref_type) const noexcept override;
303 /// Returns the first section boundary *above* the given position.
304 size_t get_upper_section_boundary(size_t start_pos) const noexcept;
306 /// Returns the first section boundary *at or below* the given position.
307 size_t get_lower_section_boundary(size_t start_pos) const noexcept;
309 /// Returns true if the given position is at a section boundary
310 bool matches_section_boundary(size_t pos) const noexcept;
312 /// Returns the index of the section holding a given address.
313 /// The section index is determined solely by the minimal section size,
314 /// and does not necessarily reflect the mapping. A mapping may
315 /// cover multiple sections - the initial mapping often does.
316 size_t get_section_index(size_t pos) const noexcept;
318 /// Reverse: get the base offset of a section at a given index. Since the
319 /// computation is very time critical, this method just looks it up in
320 /// a table. The actual computation and setup of that table is done
321 /// during initialization with the help of compute_section_base() below.
322 inline size_t get_section_base(size_t index) const noexcept;
324 /// Actually compute the starting offset of a section. Only used to initialize
325 /// a table of predefined results, which are then used by get_section_base().
326 size_t compute_section_base(size_t index) const noexcept;
328 /// Find a possible allocation of 'request_size' that will fit into a section
329 /// which is inside the range from 'start_pos' to 'start_pos'+'free_chunk_size'
330 /// If found return the position, if not return 0.
331 size_t find_section_in_range(size_t start_pos, size_t free_chunk_size, size_t request_size) const noexcept;
334 void internal_invalidate_cache() noexcept;
336 attach_None, // Nothing is attached
337 attach_OwnedBuffer, // We own the buffer (m_data = nullptr for empty buffer)
338 attach_UsersBuffer, // We do not own the buffer
339 attach_SharedFile, // On behalf of SharedGroup
340 attach_UnsharedFile // Not on behalf of SharedGroup
343 // A slab is a dynamically allocated contiguous chunk of memory used to
344 // extend the amount of space available for database node
345 // storage. Inter-node references are represented as file offsets
346 // (a.k.a. "refs"), and each slab creates an apparently seamless extension
347 // of this file offset addressable space. Slabs are stored as rows in the
348 // Slabs table in order of ascending file offsets.
358 // Values of each used bit in m_flags
365 uint64_t m_top_ref[2]; // 2 * 8 bytes
366 // Info-block 8-bytes
367 uint8_t m_mnemonic[4]; // "T-DB"
368 uint8_t m_file_format[2]; // See `library_file_format`
370 // bit 0 of m_flags is used to select between the two top refs.
375 struct StreamingFooter {
377 uint64_t m_magic_cookie;
380 static_assert(sizeof(Header) == 24, "Bad header size");
381 static_assert(sizeof(StreamingFooter) == 16, "Bad footer size");
383 static const Header empty_file_header;
384 static void init_streaming_header(Header*, int file_format_version);
386 static const uint_fast64_t footer_magic_cookie = 0x3034125237E526C8ULL;
388 // The mappings are shared, if they are from a file
389 std::shared_ptr<MappedFile> m_file_mappings;
391 // We are caching local copies of all the additional mappings to allow
392 // for lock-free lookup during ref->address translation (we do not need
393 // to cache the first mapping, because it is immutable) (well, all the
394 // mappings are immutable, but the array holding them is not - it may
395 // have to be relocated)
396 std::unique_ptr<std::shared_ptr<const util::File::Map<char>>[]> m_local_mappings;
397 size_t m_num_local_mappings = 0;
399 const char* m_data = nullptr;
400 size_t m_initial_chunk_size = 0;
401 size_t m_initial_section_size = 0;
402 int m_section_shifts = 0;
403 std::unique_ptr<size_t[]> m_section_bases;
404 size_t m_num_section_bases = 0;
405 AttachMode m_attach_mode = attach_None;
406 enum FeeeSpaceState {
412 /// When set to free_space_Invalid, the free lists are no longer
413 /// up-to-date. This happens if do_free() or
414 /// reset_free_space_tracking() fails, presumably due to
415 /// std::bad_alloc being thrown during updating of the free space
416 /// list. In this this case, alloc(), realloc_(), and
417 /// get_free_read_only() must throw. This member is deliberately
418 /// placed here (after m_attach_mode) in the hope that it leads to
419 /// less padding between members due to alignment requirements.
420 FeeeSpaceState m_free_space_state = free_space_Clean;
422 typedef std::vector<Slab> slabs;
423 typedef std::vector<Chunk> chunks;
426 chunks m_free_read_only;
428 bool m_debug_out = false;
431 const char* addr = nullptr;
434 mutable hash_entry cache[256];
435 mutable size_t version = 1;
437 /// Throws if free-lists are no longer valid.
438 void consolidate_free_read_only();
439 /// Throws if free-lists are no longer valid.
440 const chunks& get_free_read_only() const;
442 /// Throws InvalidDatabase if the file is not a Realm file, if the file is
443 /// corrupted, or if the specified encryption key is incorrect. This
444 /// function will not detect all forms of corruption, though.
445 void validate_buffer(const char* data, size_t len, const std::string& path);
447 static bool is_file_on_streaming_form(const Header& header);
448 /// Read the top_ref from the given buffer and set m_file_on_streaming_form
449 /// if the buffer contains a file in streaming form
450 static ref_type get_top_ref(const char* data, size_t len);
455 static bool ref_less_than_slab_ref_end(ref_type, const Slab&) noexcept;
457 Replication* get_replication() const noexcept
459 return m_replication;
461 void set_replication(Replication* r) noexcept
467 friend class SharedGroup;
468 friend class GroupWriter;
471 inline void SlabAlloc::internal_invalidate_cache() noexcept
// Scope guard for SlabAlloc attachment.
// NOTE(review): this declaration is fragmentary in this listing — the
// constructor's initializer list, the data member it initializes, access
// specifiers, and the closing brace are not visible here.
476 class SlabAlloc::DetachGuard {
    // Bind the guard to `alloc` — presumably stores its address; TODO confirm.
478     DetachGuard(SlabAlloc& alloc) noexcept
    // Defined out-of-line further down in this header.
482     ~DetachGuard() noexcept;
    // Disarm the guard; returns the guarded allocator (defined further down).
483     SlabAlloc* release() noexcept;
/// Error thrown when the specified file (or memory buffer) does not appear
/// to contain a valid Realm database. Derives from util::File::AccessError
/// so callers can catch it via the base class.
/// NOTE(review): the constructor body and the struct's closing brace are not
/// visible in this listing.
492 struct InvalidDatabase : util::File::AccessError {
493     InvalidDatabase(const std::string& msg, const std::string& path)
494         : util::File::AccessError(msg, path)
499 inline void SlabAlloc::own_buffer() noexcept
501 REALM_ASSERT_3(m_attach_mode, ==, attach_UsersBuffer);
502 REALM_ASSERT(m_data);
503 REALM_ASSERT(m_file_mappings == nullptr);
504 m_attach_mode = attach_OwnedBuffer;
507 inline bool SlabAlloc::is_attached() const noexcept
509 return m_attach_mode != attach_None;
512 inline bool SlabAlloc::nonempty_attachment() const noexcept
514 return is_attached() && m_data;
/// Get the size of the attached database file or buffer in bytes.
/// Precondition: the allocator must be attached (debug-asserted below).
/// NOTE(review): the remainder of this function's body (the returned value)
/// is not visible in this listing.
517 inline size_t SlabAlloc::get_baseline() const noexcept
519     REALM_ASSERT_DEBUG(is_attached());
523 inline bool SlabAlloc::is_free_space_clean() const noexcept
525 return m_free_space_state == free_space_Clean;
528 inline SlabAlloc::DetachGuard::~DetachGuard() noexcept
// Disarm the guard and hand back the guarded allocator.
// NOTE(review): only the first statement of the body is visible in this
// listing — the lines that clear the member and return `alloc` are missing.
534 inline SlabAlloc* SlabAlloc::DetachGuard::release() noexcept
536     SlabAlloc* alloc = m_alloc;
541 inline bool SlabAlloc::ref_less_than_slab_ref_end(ref_type ref, const Slab& slab) noexcept
543 return ref < slab.ref_end;
546 inline size_t SlabAlloc::get_upper_section_boundary(size_t start_pos) const noexcept
548 return get_section_base(1 + get_section_index(start_pos));
551 inline size_t SlabAlloc::get_lower_section_boundary(size_t start_pos) const noexcept
553 return get_section_base(get_section_index(start_pos));
556 inline bool SlabAlloc::matches_section_boundary(size_t pos) const noexcept
558 return pos == get_lower_section_boundary(pos);
561 inline size_t SlabAlloc::get_section_base(size_t index) const noexcept
563 return m_section_bases[index];
568 #endif // REALM_ALLOC_SLAB_HPP