From a852c207738838ff2d81217075f9f658c567cda7 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Apr 2026 16:57:00 -0500 Subject: [PATCH] Migrate memory resources from rmm::mr::device_memory_resource to CCCL resource concept RMM 26.06 removes rmm::mr::device_memory_resource and adopts CCCL-native memory resource concepts. This migrates all cuCascade resource classes: - null_device_memory_resource: replaced virtual override with CCCL concept methods (allocate/deallocate with cuda::stream_ref, allocate_sync, deallocate_sync, operator==, get_property). - numa_region_pinned_host_allocator: same pattern. - fixed_size_host_memory_resource: same pattern. - reservation_aware_resource_adaptor: same pattern. - small_pinned_host_memory_resource: same pattern. - memory_space: unique_ptr replaced with cuda::mr::any_resource for owning storage; get_default_allocator returns device_async_resource_ref from any_resource. - memory_reservation.hpp: moved template bodies out-of-line to avoid incomplete-type errors with memory_space forward declaration. - common.hpp/cpp: updated factory typedefs and pool_memory_resource usage (no longer a template in 26.06). - pool_memory_resource upstream passed as device_async_resource_ref. - Test utilities updated for new resource concept. 
--- include/cucascade/data/common.hpp | 1 - include/cucascade/memory/common.hpp | 11 +-- .../fixed_size_host_memory_resource.hpp | 63 ++++++++--------- .../cucascade/memory/memory_reservation.hpp | 29 +++----- .../memory/memory_reservation_manager.hpp | 4 +- include/cucascade/memory/memory_space.hpp | 27 ++++++-- .../memory/null_device_memory_resource.hpp | 48 +++++++++---- .../numa_region_pinned_host_allocator.hpp | 59 +++++----------- .../reservation_aware_resource_adaptor.hpp | 68 ++++++++++--------- .../small_pinned_host_memory_resource.hpp | 47 ++++++------- src/data/representation_converter.cpp | 33 +++++---- src/memory/common.cpp | 16 +++-- .../fixed_size_host_memory_resource.cpp | 17 ++--- src/memory/memory_reservation.cpp | 2 +- src/memory/memory_space.cpp | 36 ++++------ .../numa_region_pinned_host_allocator.cpp | 34 +++++++--- .../reservation_aware_resource_adaptor.cpp | 49 ++++++------- .../reservation_manager_configurator.cpp | 5 +- .../small_pinned_host_memory_resource.cpp | 19 +++--- .../test_memory_reservation_manager.cpp | 3 +- ...test_small_pinned_host_memory_resource.cpp | 4 +- test/unittest.cpp | 11 ++- test/utils/cudf_test_utils.cpp | 55 ++++++++------- test/utils/mock_test_utils.hpp | 23 ++++--- test/utils/test_memory_resources.hpp | 42 ++++++------ 25 files changed, 364 insertions(+), 342 deletions(-) diff --git a/include/cucascade/data/common.hpp b/include/cucascade/data/common.hpp index 6e3e623..6bd0ad0 100644 --- a/include/cucascade/data/common.hpp +++ b/include/cucascade/data/common.hpp @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git a/include/cucascade/memory/common.hpp b/include/cucascade/memory/common.hpp index 988aee0..9975a6c 100644 --- a/include/cucascade/memory/common.hpp +++ b/include/cucascade/memory/common.hpp @@ -18,7 +18,8 @@ #pragma once #include -#include + +#include #include #include @@ -60,13 +61,13 @@ class memory_space_id { }; using DeviceMemoryResourceFactoryFn = - std::function(int device_id, 
- std::size_t capacity)>; + std::function(int device_id, + std::size_t capacity)>; -std::unique_ptr make_default_gpu_memory_resource( +cuda::mr::any_resource make_default_gpu_memory_resource( int device_id, std::size_t capacity); -std::unique_ptr make_default_host_memory_resource( +cuda::mr::any_resource make_default_host_memory_resource( int device_id, std::size_t capacity); DeviceMemoryResourceFactoryFn make_default_allocator_for_tier(Tier tier); diff --git a/include/cucascade/memory/fixed_size_host_memory_resource.hpp b/include/cucascade/memory/fixed_size_host_memory_resource.hpp index f211a6f..99c58ee 100644 --- a/include/cucascade/memory/fixed_size_host_memory_resource.hpp +++ b/include/cucascade/memory/fixed_size_host_memory_resource.hpp @@ -26,8 +26,6 @@ #include #include #include -#include -#include #include #include @@ -65,7 +63,7 @@ namespace memory { * Modified to derive from device_memory_resource instead of host_memory_resource for RMM * compatibility. */ -class fixed_size_host_memory_resource : public rmm::mr::device_memory_resource { +class fixed_size_host_memory_resource { public: static constexpr std::size_t default_block_size = 1 << 20; ///< Default block size (1MB) static constexpr std::size_t default_pool_size = 128; ///< Default number of blocks in pool @@ -234,7 +232,7 @@ class fixed_size_host_memory_resource : public rmm::mr::device_memory_resource { /** * @brief Destructor - frees all allocated blocks. 
*/ - ~fixed_size_host_memory_resource() override; + ~fixed_size_host_memory_resource(); [[nodiscard]] std::size_t get_total_allocated_bytes() const noexcept { @@ -322,6 +320,34 @@ class fixed_size_host_memory_resource : public rmm::mr::device_memory_resource { */ std::size_t get_peak_total_allocated_bytes() const; + void* allocate(cuda::stream_ref stream, + std::size_t bytes, + std::size_t alignment = alignof(std::max_align_t)); + + void deallocate(cuda::stream_ref stream, + void* ptr, + std::size_t bytes, + std::size_t alignment = alignof(std::max_align_t)) noexcept; + + void* allocate_sync(std::size_t bytes, std::size_t alignment = alignof(std::max_align_t)) + { + return allocate(cuda::stream_ref{}, bytes, alignment); + } + + void deallocate_sync(void* ptr, + std::size_t bytes, + std::size_t alignment = alignof(std::max_align_t)) noexcept + { + deallocate(cuda::stream_ref{}, ptr, bytes, alignment); + } + + [[nodiscard]] bool operator==(fixed_size_host_memory_resource const& other) const noexcept; + + friend void get_property(fixed_size_host_memory_resource const&, + cuda::mr::device_accessible) noexcept + { + } + protected: /** * @brief grows reservation by a `bytes` size @@ -340,35 +366,6 @@ class fixed_size_host_memory_resource : public rmm::mr::device_memory_resource { std::size_t do_reserve_upto(std::size_t bytes, std::size_t mem_limit); - /** - * @brief Allocate memory of the specified size. - * - * @param bytes Size in bytes (must be <= block_size_) - * @param stream CUDA stream (ignored for host memory) - * @return void* Pointer to allocated memory - * @throws rmm::logic_error if allocation size exceeds block size - * @throws rmm::out_of_memory if no free blocks are available and upstream allocation fails - */ - void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override; - - /** - * @brief Deallocate memory. 
- * - * @param ptr Pointer to deallocate - * @param bytes Size in bytes (must be <= block_size_) - * @param stream CUDA stream (ignored for host memory) - */ - void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override; - - /** - * @brief Check if this resource is equal to another. - * - * @param other Other resource to compare - * @return bool True if equal - */ - [[nodiscard]] bool do_is_equal( - const rmm::mr::device_memory_resource& other) const noexcept override; - private: /** * @brief Expand the pool by allocating more blocks from upstream. diff --git a/include/cucascade/memory/memory_reservation.hpp b/include/cucascade/memory/memory_reservation.hpp index b881da3..e4371f4 100644 --- a/include/cucascade/memory/memory_reservation.hpp +++ b/include/cucascade/memory/memory_reservation.hpp @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include @@ -42,23 +42,19 @@ class memory_space; template struct tier_memory_resource_trait { - using upstream_type = rmm::mr::device_memory_resource; - using type = rmm::mr::device_memory_resource; - Tier tier = TIER; + Tier tier = TIER; }; template <> struct tier_memory_resource_trait { - using upstream_type = rmm::mr::device_memory_resource; - using type = fixed_size_host_memory_resource; - Tier tier = Tier::HOST; + using type = fixed_size_host_memory_resource; + Tier tier = Tier::HOST; }; template <> struct tier_memory_resource_trait { - using upstream_type = rmm::mr::device_memory_resource; - using type = reservation_aware_resource_adaptor; - Tier tier = Tier::GPU; + using type = reservation_aware_resource_adaptor; + Tier tier = Tier::GPU; }; //===----------------------------------------------------------------------===// @@ -216,22 +212,15 @@ class reservation { [[nodiscard]] int device_id() const noexcept; - [[nodiscard]] rmm::mr::device_memory_resource* get_memory_resource() const noexcept; + [[nodiscard]] rmm::device_async_resource_ref get_memory_resource() const 
noexcept; [[nodiscard]] const memory_space& get_memory_space() const noexcept; template - requires std::derived_from - T* get_memory_resource_as() const noexcept - { - return dynamic_cast(get_memory_resource()); - } + T* get_memory_resource_as() const noexcept; template - auto* get_memory_resource_of() const noexcept - { - return get_memory_resource_as::type>(); - } + auto* get_memory_resource_of() const noexcept; //===----------------------------------------------------------------------===// // Reservation Size Management diff --git a/include/cucascade/memory/memory_reservation_manager.hpp b/include/cucascade/memory/memory_reservation_manager.hpp index fc8e09f..1beb3fc 100644 --- a/include/cucascade/memory/memory_reservation_manager.hpp +++ b/include/cucascade/memory/memory_reservation_manager.hpp @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -34,9 +35,6 @@ #include #include -#include -#include - namespace cucascade { namespace memory { diff --git a/include/cucascade/memory/memory_space.hpp b/include/cucascade/memory/memory_space.hpp index 96d2855..8ddcde8 100644 --- a/include/cucascade/memory/memory_space.hpp +++ b/include/cucascade/memory/memory_space.hpp @@ -33,7 +33,6 @@ #include #include #include -#include #include namespace cucascade { @@ -100,13 +99,19 @@ class memory_space { [[nodiscard]] size_t get_max_memory() const noexcept; // Allocator management - [[nodiscard]] rmm::mr::device_memory_resource* get_default_allocator() const noexcept; + [[nodiscard]] rmm::device_async_resource_ref get_default_allocator() const noexcept; template - requires std::derived_from T* get_memory_resource_as() const noexcept { - return dynamic_cast(get_default_allocator()); + T* result = nullptr; + std::visit( + [&result](const auto& ptr) { + using held_type = std::decay_t; + if constexpr (std::is_same_v) { result = ptr.get(); } + }, + _reservation_allocator); + return result; } template @@ -136,7 +141,7 @@ class memory_space { std::make_shared(); // 
Memory resources owned by this memory_space - std::unique_ptr _allocator; + cuda::mr::any_resource _allocator; reserving_adaptor_type _reservation_allocator; std::unique_ptr _stream_pool; }; @@ -149,5 +154,17 @@ struct memory_space_hash { size_t operator()(const memory_space& ms) const; }; +template +T* reservation::get_memory_resource_as() const noexcept +{ + return _space->get_memory_resource_as(); +} + +template +auto* reservation::get_memory_resource_of() const noexcept +{ + return get_memory_resource_as::type>(); +} + } // namespace memory } // namespace cucascade diff --git a/include/cucascade/memory/null_device_memory_resource.hpp b/include/cucascade/memory/null_device_memory_resource.hpp index a523fc0..05390aa 100644 --- a/include/cucascade/memory/null_device_memory_resource.hpp +++ b/include/cucascade/memory/null_device_memory_resource.hpp @@ -17,38 +17,58 @@ #pragma once -#include -#include +#include +#include + +#include namespace cucascade { namespace memory { /** - * A no-op device_memory_resource used for DISK tier to satisfy API requirements. + * A no-op memory resource used for DISK tier to satisfy API requirements. 
* - allocate always returns nullptr * - deallocate is a no-op */ -class null_device_memory_resource : public rmm::mr::device_memory_resource { +class null_device_memory_resource { public: - null_device_memory_resource() = default; - ~null_device_memory_resource() override = default; + null_device_memory_resource() = default; + ~null_device_memory_resource() = default; - protected: - void* do_allocate([[maybe_unused]] std::size_t bytes, - [[maybe_unused]] rmm::cuda_stream_view stream) override + void* allocate([[maybe_unused]] cuda::stream_ref stream, + [[maybe_unused]] std::size_t bytes, + [[maybe_unused]] std::size_t alignment = alignof(std::max_align_t)) { return nullptr; } - void do_deallocate([[maybe_unused]] void* p, - [[maybe_unused]] std::size_t bytes, - [[maybe_unused]] rmm::cuda_stream_view stream) noexcept override + + void deallocate([[maybe_unused]] cuda::stream_ref stream, + [[maybe_unused]] void* p, + [[maybe_unused]] std::size_t bytes, + [[maybe_unused]] std::size_t alignment = alignof(std::max_align_t)) noexcept { } - [[nodiscard]] bool do_is_equal( - const rmm::mr::device_memory_resource& other) const noexcept override + + void* allocate_sync([[maybe_unused]] std::size_t bytes, + [[maybe_unused]] std::size_t alignment = alignof(std::max_align_t)) + { + return nullptr; + } + + void deallocate_sync([[maybe_unused]] void* p, + [[maybe_unused]] std::size_t bytes, + [[maybe_unused]] std::size_t alignment = alignof(std::max_align_t)) noexcept + { + } + + bool operator==(null_device_memory_resource const& other) const noexcept { return this == &other; } + + friend void get_property(null_device_memory_resource const&, cuda::mr::device_accessible) noexcept + { + } }; } // namespace memory diff --git a/include/cucascade/memory/numa_region_pinned_host_allocator.hpp b/include/cucascade/memory/numa_region_pinned_host_allocator.hpp index a5f66b5..03ad24d 100644 --- a/include/cucascade/memory/numa_region_pinned_host_allocator.hpp +++ 
b/include/cucascade/memory/numa_region_pinned_host_allocator.hpp @@ -17,8 +17,7 @@ #pragma once -#include - +#include #include #include @@ -26,7 +25,7 @@ namespace cucascade { namespace memory { -class numa_region_pinned_host_memory_resource final : public rmm::mr::device_memory_resource { +class numa_region_pinned_host_memory_resource final { public: explicit numa_region_pinned_host_memory_resource(int numa_node) : _numa_node(numa_node) {} ~numa_region_pinned_host_memory_resource() = default; @@ -37,53 +36,32 @@ class numa_region_pinned_host_memory_resource final : public rmm::mr::device_mem numa_region_pinned_host_memory_resource& operator=(numa_region_pinned_host_memory_resource&&) = default; - private: /** * @brief Allocates pinned host memory of size at least \p bytes bytes. - * - * @throws rmm::out_of_memory if the requested allocation could not be fulfilled due to a - * CUDA out of memory error. - * @throws rmm::bad_alloc if the requested allocation could not be fulfilled due to any other - * reason. - * - * The stream argument is ignored. - * - * @param bytes The size, in bytes, of the allocation. - * @param stream CUDA stream on which to perform the allocation (ignored). - * - * @return Pointer to the newly allocated memory. */ - void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override; + void* allocate(cuda::stream_ref stream, + std::size_t bytes, + std::size_t alignment = alignof(std::max_align_t)); /** * @brief Deallocate memory pointed to by \p ptr. - * - * The stream argument is ignored. - * - * @param ptr Pointer to be deallocated - * @param bytes The size in bytes of the allocation. This must be equal to the - * value of `bytes` that was passed to the `allocate` call that returned `ptr`. - * @param stream This argument is ignored. 
*/ - void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override; + void deallocate(cuda::stream_ref stream, + void* ptr, + std::size_t bytes, + std::size_t alignment = alignof(std::max_align_t)) noexcept; - /** - * @brief Compare this resource to another. - * - * Two pinned_host_memory_resources always compare equal, because they can each - * deallocate memory allocated by the other. - * - * @param other The other resource to compare to - * @return true If the two resources are equivalent - * @return false If the two resources are not equal - */ - [[nodiscard]] bool do_is_equal( - const rmm::mr::device_memory_resource& other) const noexcept override; + void* allocate_sync(std::size_t bytes, std::size_t alignment = alignof(std::max_align_t)); + + void deallocate_sync(void* ptr, + std::size_t bytes, + std::size_t alignment = alignof(std::max_align_t)) noexcept; + + [[nodiscard]] bool operator==( + numa_region_pinned_host_memory_resource const& other) const noexcept; /** * @brief Enables the `cuda::mr::device_accessible` property - * - * This property declares that a `pinned_host_memory_resource` provides device accessible memory */ friend void get_property(numa_region_pinned_host_memory_resource const&, cuda::mr::device_accessible) noexcept @@ -92,14 +70,13 @@ class numa_region_pinned_host_memory_resource final : public rmm::mr::device_mem /** * @brief Enables the `cuda::mr::host_accessible` property - * - * This property declares that a `pinned_host_memory_resource` provides host accessible memory */ friend void get_property(numa_region_pinned_host_memory_resource const&, cuda::mr::host_accessible) noexcept { } + private: int _numa_node{-1}; }; diff --git a/include/cucascade/memory/reservation_aware_resource_adaptor.hpp b/include/cucascade/memory/reservation_aware_resource_adaptor.hpp index 73f5de3..ccca680 100644 --- a/include/cucascade/memory/reservation_aware_resource_adaptor.hpp +++ 
b/include/cucascade/memory/reservation_aware_resource_adaptor.hpp @@ -18,9 +18,9 @@ #pragma once #include -#include #include +#include #include #include @@ -54,7 +54,7 @@ namespace memory { * * Based on RMM's tracking_resource_adaptor but extended for per-stream tracking. */ -class reservation_aware_resource_adaptor : public rmm::mr::device_memory_resource { +class reservation_aware_resource_adaptor { public: struct device_reserved_arena : public reserved_arena { friend class reservation_aware_resource_adaptor; @@ -163,8 +163,8 @@ class reservation_aware_resource_adaptor : public rmm::mr::device_memory_resourc std::size_t capacity, std::unique_ptr stream_reservation_policy = nullptr, std::unique_ptr default_oom_policy = nullptr, - AllocationTrackingScope tracking_scope = AllocationTrackingScope::PER_STREAM, - cudaMemPool_t pool_handle = nullptr); + AllocationTrackingScope tracking_scope = AllocationTrackingScope::PER_STREAM, + cudaMemPool_t pool_handle = nullptr); /** * @brief Constructs a per-stream tracking resource adaptor. @@ -186,13 +186,13 @@ class reservation_aware_resource_adaptor : public rmm::mr::device_memory_resourc std::size_t capacity, std::unique_ptr stream_reservation_policy = nullptr, std::unique_ptr default_oom_policy = nullptr, - AllocationTrackingScope tracking_scope = AllocationTrackingScope::PER_STREAM, - cudaMemPool_t pool_handle = nullptr); + AllocationTrackingScope tracking_scope = AllocationTrackingScope::PER_STREAM, + cudaMemPool_t pool_handle = nullptr); /** * @brief Destructor. 
*/ - ~reservation_aware_resource_adaptor() override = default; + ~reservation_aware_resource_adaptor() = default; // Non-copyable and non-movable to ensure resource stability reservation_aware_resource_adaptor(const reservation_aware_resource_adaptor&) = delete; @@ -326,6 +326,34 @@ class reservation_aware_resource_adaptor : public rmm::mr::device_memory_resourc */ const oom_handling_policy& get_default_oom_handling_policy() const; + void* allocate(cuda::stream_ref stream, + std::size_t bytes, + std::size_t alignment = alignof(std::max_align_t)); + + void deallocate(cuda::stream_ref stream, + void* ptr, + std::size_t bytes, + std::size_t alignment = alignof(std::max_align_t)) noexcept; + + void* allocate_sync(std::size_t bytes, std::size_t alignment = alignof(std::max_align_t)) + { + return allocate(cuda::stream_ref{}, bytes, alignment); + } + + void deallocate_sync(void* ptr, + std::size_t bytes, + std::size_t alignment = alignof(std::max_align_t)) noexcept + { + deallocate(cuda::stream_ref{}, ptr, bytes, alignment); + } + + bool operator==(reservation_aware_resource_adaptor const& other) const noexcept; + + friend void get_property(reservation_aware_resource_adaptor const&, + cuda::mr::device_accessible) noexcept + { + } + private: /** * @brief grows reservation by a `bytes` size @@ -396,32 +424,6 @@ class reservation_aware_resource_adaptor : public rmm::mr::device_memory_resourc */ void do_release_reservation(device_reserved_arena* reservation) noexcept; - /** - * @brief Allocates memory from the upstream resource and tracks it. - * - * @param bytes The number of bytes to allocate - * @param stream The CUDA stream to use for the allocation - * @return Pointer to allocated memory - */ - void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override; - - /** - * @brief Deallocates previously allocated memory and updates tracking. 
- * - * @param ptr Pointer to memory to deallocate - * @param bytes The number of bytes that were allocated - * @param stream The CUDA stream to use for the deallocation - */ - void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override; - - /** - * @brief Checks equality with another memory resource. - * - * @param other The other memory resource to compare with - * @return true if this resource is the same as other - */ - bool do_is_equal(const rmm::mr::device_memory_resource& other) const noexcept override; - memory_space_id _space_id; /// The upstream memory resource diff --git a/include/cucascade/memory/small_pinned_host_memory_resource.hpp b/include/cucascade/memory/small_pinned_host_memory_resource.hpp index ddd6c35..003d606 100644 --- a/include/cucascade/memory/small_pinned_host_memory_resource.hpp +++ b/include/cucascade/memory/small_pinned_host_memory_resource.hpp @@ -19,8 +19,8 @@ #include -#include -#include +#include +#include #include #include @@ -53,7 +53,7 @@ namespace memory { * This eliminates the pageable H2D transfers that cuDF would otherwise issue * when building column_device_view metadata arrays for cudf::concatenate. */ -class small_pinned_host_memory_resource : public rmm::mr::device_memory_resource { +class small_pinned_host_memory_resource { public: /// Maximum allocation size handled by the slab pools. /// Requests larger than this use pageable memory. @@ -72,35 +72,36 @@ class small_pinned_host_memory_resource : public rmm::mr::device_memory_resource small_pinned_host_memory_resource(small_pinned_host_memory_resource&&) = delete; small_pinned_host_memory_resource& operator=(small_pinned_host_memory_resource&&) = delete; - ~small_pinned_host_memory_resource() override; + ~small_pinned_host_memory_resource(); - private: /** * @brief Allocate pinned memory. 
- * - * For @p bytes <= MAX_SLAB_SIZE: rounds up to the next slab boundary - * (512 / 1 KB / 2 KB / 4 KB / 8 KB) and returns a pointer from the matching - * free list, expanding the pool from upstream if the list is empty. - * - * For @p bytes > MAX_SLAB_SIZE: falls back to std::malloc (pageable). - * The cudf::set_allocate_host_as_pinned_threshold is set to MAX_SLAB_SIZE so - * that cuDF's make_host_vector path uses the slab pools for metadata buffers. - * Larger allocations (e.g. join/sort staging buffers that call - * get_pinned_memory_resource() directly) are served from pageable memory. */ - void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override; + void* allocate(cuda::stream_ref stream, + std::size_t bytes, + std::size_t alignment = alignof(std::max_align_t)); /** * @brief Return memory to the appropriate pool. - * - * Slabs (@p bytes <= MAX_SLAB_SIZE) are returned to the free list. - * Pageable allocations (@p bytes > MAX_SLAB_SIZE) are freed via std::free. - * @p bytes must equal the value passed to the corresponding do_allocate. */ - void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override; + void deallocate(cuda::stream_ref stream, + void* ptr, + std::size_t bytes, + std::size_t alignment = alignof(std::max_align_t)) noexcept; + + void* allocate_sync(std::size_t bytes, std::size_t alignment = alignof(std::max_align_t)) + { + return allocate(cuda::stream_ref{}, bytes, alignment); + } + + void deallocate_sync(void* ptr, + std::size_t bytes, + std::size_t alignment = alignof(std::max_align_t)) noexcept + { + deallocate(cuda::stream_ref{}, ptr, bytes, alignment); + } - [[nodiscard]] bool do_is_equal( - const rmm::mr::device_memory_resource& other) const noexcept override; + bool operator==(small_pinned_host_memory_resource const& other) const noexcept; /** * @brief Declares that memory allocated here is accessible from GPU devices. 
diff --git a/src/data/representation_converter.cpp b/src/data/representation_converter.cpp index 117eb75..e1bd351 100644 --- a/src/data/representation_converter.cpp +++ b/src/data/representation_converter.cpp @@ -17,6 +17,7 @@ #include "cudf/contiguous_split.hpp" +#include #include #include #include @@ -32,10 +33,9 @@ #include #include -#include - #include #include +#include #include @@ -156,11 +156,11 @@ std::unique_ptr convert_gpu_to_gpu( // Asynchronously copy device->device across GPUs CUCASCADE_CUDA_TRY(cudaMemcpyPeerAsync(dst_uvector.data(), - target_device_id, - static_cast(packed_data.gpu_data->data()), - source_device_id, - bytes_to_copy, - stream.value())); + target_device_id, + static_cast(packed_data.gpu_data->data()), + source_device_id, + bytes_to_copy, + stream.value())); stream.synchronize(); // Unpack on target device to build a cudf::table that lives on the target GPU CUCASCADE_CUDA_TRY(cudaSetDevice(target_device_id)); @@ -254,10 +254,10 @@ std::unique_ptr convert_host_to_gpu( size_t bytes_to_copy = std::min(remaining_bytes, bytes_available_in_src_block); auto src_block = host_table->allocation->at(src_block_index); CUCASCADE_CUDA_TRY(cudaMemcpyAsync(static_cast(dst_buffer.data()) + dst_offset, - src_block.data() + src_block_offset, - bytes_to_copy, - cudaMemcpyHostToDevice, - stream.value())); + src_block.data() + src_block_offset, + bytes_to_copy, + cudaMemcpyHostToDevice, + stream.value())); dst_offset += bytes_to_copy; src_block_offset += bytes_to_copy; if (src_block_offset == src_block_size) { @@ -417,7 +417,8 @@ struct BatchCopyAccumulator { // cudaMemcpyBatchAsync requires CUDA 12.8+; fall back to individual copies. 
(void)src_order; for (std::size_t i = 0; i < count(); ++i) { - CUCASCADE_CUDA_TRY(cudaMemcpyAsync(dsts[i], srcs[i], sizes[i], cudaMemcpyDefault, stream.value())); + CUCASCADE_CUDA_TRY( + cudaMemcpyAsync(dsts[i], srcs[i], sizes[i], cudaMemcpyDefault, stream.value())); } #endif } @@ -613,7 +614,7 @@ static rmm::device_buffer alloc_and_schedule_h2d(memory::fixed_multiple_blocks_a std::size_t alloc_offset, std::size_t size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr, + rmm::device_async_resource_ref mr, BatchCopyAccumulator& batch) { rmm::device_buffer buf(size, stream, mr); @@ -655,7 +656,7 @@ static std::unique_ptr reconstruct_column( const memory::column_metadata& meta, memory::fixed_multiple_blocks_allocation& alloc, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr, + rmm::device_async_resource_ref mr, BatchCopyAccumulator& batch) { // Null mask (shared by all type categories) @@ -729,9 +730,7 @@ std::unique_ptr convert_host_fast_to_gpu( { auto& fast_source = source.cast(); const auto& fast_table = fast_source.get_host_table(); - if (!fast_table) { - throw std::runtime_error("convert_host_fast_to_gpu: host table is null"); - } + if (!fast_table) { throw std::runtime_error("convert_host_fast_to_gpu: host table is null"); } if (!fast_table->allocation) { throw std::runtime_error("convert_host_fast_to_gpu: host table allocation is null"); } diff --git a/src/memory/common.cpp b/src/memory/common.cpp index 1cb1eba..fe278d3 100644 --- a/src/memory/common.cpp +++ b/src/memory/common.cpp @@ -26,17 +26,19 @@ namespace cucascade { namespace memory { -std::unique_ptr make_default_gpu_memory_resource(int device_id, - size_t capacity) +cuda::mr::any_resource make_default_gpu_memory_resource( + int device_id, size_t capacity) { rmm::cuda_set_device_raii set_device(rmm::cuda_device_id{device_id}); - return std::make_unique(capacity); + return cuda::mr::any_resource{ + rmm::mr::cuda_async_memory_resource(capacity)}; } -std::unique_ptr 
make_default_host_memory_resource( +cuda::mr::any_resource make_default_host_memory_resource( int numa_node_id, [[maybe_unused]] size_t capacity) { - return std::make_unique(numa_node_id); + return cuda::mr::any_resource{ + cucascade::memory::numa_region_pinned_host_memory_resource(numa_node_id)}; } DeviceMemoryResourceFactoryFn make_default_allocator_for_tier(Tier tier) @@ -46,7 +48,9 @@ DeviceMemoryResourceFactoryFn make_default_allocator_for_tier(Tier tier) } else if (tier == Tier::HOST) { return make_default_host_memory_resource; } else { - return [](int, size_t) { return std::make_unique(); }; + return [](int, size_t) { + return cuda::mr::any_resource{null_device_memory_resource{}}; + }; } } diff --git a/src/memory/fixed_size_host_memory_resource.cpp b/src/memory/fixed_size_host_memory_resource.cpp index d53f5d6..cff6a40 100644 --- a/src/memory/fixed_size_host_memory_resource.cpp +++ b/src/memory/fixed_size_host_memory_resource.cpp @@ -23,7 +23,6 @@ #include #include -#include #include #include @@ -188,21 +187,23 @@ std::vector fixed_size_host_memory_resource::allocate_multiple_block return {}; } -void* fixed_size_host_memory_resource::do_allocate(std::size_t /*bytes*/, - rmm::cuda_stream_view /*stream*/) +void* fixed_size_host_memory_resource::allocate([[maybe_unused]] cuda::stream_ref stream, + [[maybe_unused]] std::size_t bytes, + [[maybe_unused]] std::size_t alignment) { throw rmm::logic_error( "fixed_size_host_memory_resource doesn't support allocate, use allocate_multiple_blocks"); } -void fixed_size_host_memory_resource::do_deallocate(void* /*ptr*/, - std::size_t /*bytes*/, - rmm::cuda_stream_view /*stream*/) noexcept +void fixed_size_host_memory_resource::deallocate([[maybe_unused]] cuda::stream_ref stream, + [[maybe_unused]] void* ptr, + [[maybe_unused]] std::size_t bytes, + [[maybe_unused]] std::size_t alignment) noexcept { } -bool fixed_size_host_memory_resource::do_is_equal( - const rmm::mr::device_memory_resource& other) const noexcept +bool 
fixed_size_host_memory_resource::operator==( + fixed_size_host_memory_resource const& other) const noexcept { return this == &other; } diff --git a/src/memory/memory_reservation.cpp b/src/memory/memory_reservation.cpp index 8e16e8b..58dd9eb 100644 --- a/src/memory/memory_reservation.cpp +++ b/src/memory/memory_reservation.cpp @@ -49,7 +49,7 @@ reservation::reservation(const memory_space* space, std::unique_ptrget_default_allocator(); } diff --git a/src/memory/memory_space.cpp b/src/memory/memory_space.cpp index dff87f4..bb199ff 100644 --- a/src/memory/memory_space.cpp +++ b/src/memory/memory_space.cpp @@ -49,26 +49,25 @@ memory_space::memory_space(const gpu_memory_space_config& config) _memory_limit(config.reservation_limit()), _start_downgrading_memory_threshold(config.downgrade_trigger_threshold()), _stop_downgrading_memory_threshold(config.downgrade_stop_threshold()), - _allocator(config.mr_factory_fn - ? config.mr_factory_fn(config.device_id, config.memory_capacity) - : make_default_gpu_memory_resource(config.device_id, config.memory_capacity)), _stream_pool{[&]() -> std::unique_ptr { rmm::cuda_set_device_raii guard{rmm::cuda_device_id(config.device_id)}; return std::make_unique(16, rmm::cuda_stream::flags::non_blocking); }()} { - if (!_allocator) { throw std::invalid_argument("At least one allocator must be provided"); } - cudaMemPool_t pool_handle{nullptr}; - if (auto* r = dynamic_cast(_allocator.get())) { - pool_handle = r->pool_handle(); - } else if (auto* r = dynamic_cast(_allocator.get())) { - pool_handle = r->pool_handle(); + + if (config.mr_factory_fn) { + _allocator = config.mr_factory_fn(config.device_id, config.memory_capacity); + } else { + rmm::cuda_set_device_raii set_device(rmm::cuda_device_id{config.device_id}); + rmm::mr::cuda_async_memory_resource concrete_mr(config.memory_capacity); + pool_handle = concrete_mr.pool_handle(); + _allocator = cuda::mr::any_resource(std::move(concrete_mr)); } _reservation_allocator = std::make_unique( _id, - 
*_allocator, + rmm::device_async_resource_ref(_allocator), _memory_limit, _capacity, nullptr, @@ -89,10 +88,9 @@ memory_space::memory_space(const host_memory_space_config& config) ? config.mr_factory_fn(config.numa_id, config.memory_capacity) : make_default_host_memory_resource(config.numa_id, config.memory_capacity)) { - if (!_allocator) { throw std::invalid_argument("At least one allocator must be provided"); } _reservation_allocator = std::make_unique(_id.device_id, - *_allocator, + rmm::device_async_resource_ref(_allocator), _memory_limit, _capacity, config.block_size, @@ -106,7 +104,7 @@ memory_space::memory_space(const disk_memory_space_config& config) _memory_limit(config.reservation_limit()), _start_downgrading_memory_threshold(config.downgrade_trigger_threshold()), _stop_downgrading_memory_threshold(config.downgrade_stop_threshold()), - _allocator(std::make_unique()) + _allocator(null_device_memory_resource{}) { if (config.mount_paths.empty()) { throw std::invalid_argument("Mount path must be provided for disk memory space"); @@ -257,16 +255,10 @@ size_t memory_space::get_total_reserved_memory() const size_t memory_space::get_max_memory() const noexcept { return _memory_limit; } -rmm::mr::device_memory_resource* memory_space::get_default_allocator() const noexcept +rmm::device_async_resource_ref memory_space::get_default_allocator() const noexcept { - return std::visit( - utils::overloaded{[this]([[maybe_unused]] const std::unique_ptr& other) - -> rmm::mr::device_memory_resource* { return _allocator.get(); }, - [](const std::unique_ptr& mr) - -> rmm::mr::device_memory_resource* { return mr.get(); }, - [](const std::unique_ptr& mr) - -> rmm::mr::device_memory_resource* { return mr.get(); }}, - _reservation_allocator); + return rmm::device_async_resource_ref( + const_cast&>(_allocator)); } std::string memory_space::to_string() const diff --git a/src/memory/numa_region_pinned_host_allocator.cpp b/src/memory/numa_region_pinned_host_allocator.cpp index 
9a09504..a1492fd 100644 --- a/src/memory/numa_region_pinned_host_allocator.cpp +++ b/src/memory/numa_region_pinned_host_allocator.cpp @@ -15,9 +15,8 @@ * limitations under the License. */ -#include - #include +#include #include @@ -26,8 +25,9 @@ namespace cucascade { namespace memory { -void* numa_region_pinned_host_memory_resource::do_allocate( - std::size_t bytes, [[maybe_unused]] rmm::cuda_stream_view stream) +void* numa_region_pinned_host_memory_resource::allocate([[maybe_unused]] cuda::stream_ref stream, + std::size_t bytes, + [[maybe_unused]] std::size_t alignment) { CUCASCADE_FUNC_RANGE(); // don't allocate anything if the user requested zero bytes @@ -45,8 +45,11 @@ void* numa_region_pinned_host_memory_resource::do_allocate( } } -void numa_region_pinned_host_memory_resource::do_deallocate( - void* ptr, std::size_t bytes, [[maybe_unused]] rmm::cuda_stream_view stream) noexcept +void numa_region_pinned_host_memory_resource::deallocate( + [[maybe_unused]] cuda::stream_ref stream, + void* ptr, + std::size_t bytes, + [[maybe_unused]] std::size_t alignment) noexcept { CUCASCADE_FUNC_RANGE(); if (_numa_node == -1) { @@ -57,11 +60,22 @@ void numa_region_pinned_host_memory_resource::do_deallocate( } } -bool numa_region_pinned_host_memory_resource::do_is_equal( - const rmm::mr::device_memory_resource& other) const noexcept +void* numa_region_pinned_host_memory_resource::allocate_sync(std::size_t bytes, + [[maybe_unused]] std::size_t alignment) +{ + return allocate(cuda::stream_ref{}, bytes, alignment); +} + +void numa_region_pinned_host_memory_resource::deallocate_sync( + void* ptr, std::size_t bytes, [[maybe_unused]] std::size_t alignment) noexcept +{ + deallocate(cuda::stream_ref{}, ptr, bytes, alignment); +} + +bool numa_region_pinned_host_memory_resource::operator==( + numa_region_pinned_host_memory_resource const& other) const noexcept { - auto* mr_ptr = dynamic_cast(&other); - return mr_ptr == this && mr_ptr->_numa_node == this->_numa_node; + return this == 
&other && _numa_node == other._numa_node; } } // namespace memory diff --git a/src/memory/reservation_aware_resource_adaptor.cpp b/src/memory/reservation_aware_resource_adaptor.cpp index e864ace..5412e81 100644 --- a/src/memory/reservation_aware_resource_adaptor.cpp +++ b/src/memory/reservation_aware_resource_adaptor.cpp @@ -15,18 +15,17 @@ * limitations under the License. */ +#include #include #include #include #include -#include - -#include - #include #include +#include + #include #include #include @@ -44,7 +43,6 @@ using device_reserved_arena = reservation_aware_resource_adaptor::device_reserve namespace { - struct stream_ordered_allocation_tracker : public reservation_aware_resource_adaptor::allocation_tracker_iface { mutable std::mutex mutex; @@ -378,14 +376,16 @@ std::size_t reservation_aware_resource_adaptor::get_active_reservation_count() c return _number_of_allocations.load(); } -void* reservation_aware_resource_adaptor::do_allocate(std::size_t bytes, - rmm::cuda_stream_view stream) +void* reservation_aware_resource_adaptor::allocate(cuda::stream_ref stream, + std::size_t bytes, + [[maybe_unused]] std::size_t alignment) { - auto* reservation_state = _allocation_tracker->get_tracker_state(stream); + rmm::cuda_stream_view stream_view(stream.get()); + auto* reservation_state = _allocation_tracker->get_tracker_state(stream_view); if (reservation_state != nullptr) { - return do_allocate_managed(bytes, reservation_state, stream); + return do_allocate_managed(bytes, reservation_state, stream_view); } else { - return do_allocate_managed(bytes, stream); + return do_allocate_managed(bytes, stream_view); } } @@ -445,8 +445,11 @@ void* reservation_aware_resource_adaptor::do_allocate_unmanaged(std::size_t allo return _upstream.allocate(stream, allocation_bytes); } catch (std::exception& e) { _total_allocated_bytes.sub(tracking_bytes); - throw cucascade_out_of_memory( - e.what(), MemoryError::ALLOCATION_FAILED, allocation_bytes, post_allocation_size, _pool_handle); + 
throw cucascade_out_of_memory(e.what(), + MemoryError::ALLOCATION_FAILED, + allocation_bytes, + post_allocation_size, + _pool_handle); } } else { throw cucascade_out_of_memory("not enough capacity to allocate memory", @@ -457,13 +460,15 @@ void* reservation_aware_resource_adaptor::do_allocate_unmanaged(std::size_t allo } } -void reservation_aware_resource_adaptor::do_deallocate(void* ptr, - std::size_t bytes, - rmm::cuda_stream_view stream) noexcept +void reservation_aware_resource_adaptor::deallocate(cuda::stream_ref stream, + void* ptr, + std::size_t bytes, + [[maybe_unused]] std::size_t alignment) noexcept { + rmm::cuda_stream_view stream_view(stream.get()); auto tracking_bytes = rmm::align_up(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT); auto upstream_reclaimed_bytes = tracking_bytes; - auto* reservation_state = _allocation_tracker->get_tracker_state(stream); + auto* reservation_state = _allocation_tracker->get_tracker_state(stream_view); if (reservation_state != nullptr) { auto* reservation = reservation_state->memory_reservation.get(); auto reservation_size = static_cast(reservation->size()); @@ -481,22 +486,18 @@ void reservation_aware_resource_adaptor::do_deallocate(void* ptr, // Suppress false-positive null-dereference warnings from CCCL library code #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wnull-dereference" - _upstream.deallocate(stream, ptr, bytes); + _upstream.deallocate(stream_view, ptr, bytes); #pragma GCC diagnostic pop _total_allocated_bytes.sub(upstream_reclaimed_bytes); } -bool reservation_aware_resource_adaptor::do_is_equal( - const rmm::mr::device_memory_resource& other) const noexcept +bool reservation_aware_resource_adaptor::operator==( + reservation_aware_resource_adaptor const& other) const noexcept { - // Check if it's the same type - const auto* other_adaptor = dynamic_cast(&other); - if (other_adaptor == nullptr) { return false; } - // Suppress false-positive null-dereference warnings from CCCL library code #pragma GCC 
diagnostic push #pragma GCC diagnostic ignored "-Wnull-dereference" - return _upstream == other_adaptor->get_upstream_resource(); + return _upstream == other.get_upstream_resource(); #pragma GCC diagnostic pop } diff --git a/src/memory/reservation_manager_configurator.cpp b/src/memory/reservation_manager_configurator.cpp index d15e4c0..b0dddec 100644 --- a/src/memory/reservation_manager_configurator.cpp +++ b/src/memory/reservation_manager_configurator.cpp @@ -21,7 +21,6 @@ #include #include -#include #include @@ -209,7 +208,7 @@ std::vector reservation_manager_configurator::build( (info.space_id == info.hw_id) ? _gpu_mr_fn : [current_mr_fn = _gpu_mr_fn, hw_id = info.hw_id]( - int, size_t capacity) -> std::unique_ptr { + int, size_t capacity) -> cuda::mr::any_resource { return current_mr_fn(hw_id, capacity); }; config.reservation_limit_fraction = _gpu_reservation.get_fraction(info.gpu_capacity); @@ -232,7 +231,7 @@ std::vector reservation_manager_configurator::build( (info.space_id == info.numa_id) ? _cpu_mr_fn : [current_mr_fn = _cpu_mr_fn, numa_id = info.numa_id]( - int, size_t capacity) -> std::unique_ptr { + int, size_t capacity) -> cuda::mr::any_resource { return current_mr_fn(numa_id, capacity); }; config.reservation_limit_fraction = _cpu_reservation.get_fraction(per_host_capacity); diff --git a/src/memory/small_pinned_host_memory_resource.cpp b/src/memory/small_pinned_host_memory_resource.cpp index 051a3d4..aabd515 100644 --- a/src/memory/small_pinned_host_memory_resource.cpp +++ b/src/memory/small_pinned_host_memory_resource.cpp @@ -37,8 +37,9 @@ small_pinned_host_memory_resource::~small_pinned_host_memory_resource() // free_lists_ entries are raw pointers into those blocks; no individual cleanup needed. 
} -void* small_pinned_host_memory_resource::do_allocate(std::size_t bytes, - [[maybe_unused]] rmm::cuda_stream_view stream) +void* small_pinned_host_memory_resource::allocate([[maybe_unused]] cuda::stream_ref stream, + std::size_t bytes, + [[maybe_unused]] std::size_t alignment) { if (bytes == 0) { return nullptr; } // cuDF calls get_pinned_memory_resource() directly from some code paths (e.g. join/sort @@ -50,9 +51,7 @@ void* small_pinned_host_memory_resource::do_allocate(std::size_t bytes, if (bytes > MAX_SLAB_SIZE) { void* ptr = nullptr; auto err = ::cudaMallocHost(&ptr, bytes); - if (err != cudaSuccess) { - throw std::bad_alloc{}; - } + if (err != cudaSuccess) { throw std::bad_alloc{}; } return ptr; } @@ -64,8 +63,10 @@ void* small_pinned_host_memory_resource::do_allocate(std::size_t bytes, return ptr; } -void small_pinned_host_memory_resource::do_deallocate( - void* ptr, std::size_t bytes, [[maybe_unused]] rmm::cuda_stream_view stream) noexcept +void small_pinned_host_memory_resource::deallocate([[maybe_unused]] cuda::stream_ref stream, + void* ptr, + std::size_t bytes, + [[maybe_unused]] std::size_t alignment) noexcept { if (ptr == nullptr || bytes == 0) { return; } if (bytes > MAX_SLAB_SIZE) { @@ -78,8 +79,8 @@ void small_pinned_host_memory_resource::do_deallocate( free_lists_[idx].push_back(ptr); } -bool small_pinned_host_memory_resource::do_is_equal( - const rmm::mr::device_memory_resource& other) const noexcept +bool small_pinned_host_memory_resource::operator==( + small_pinned_host_memory_resource const& other) const noexcept { return this == &other; } diff --git a/test/memory/test_memory_reservation_manager.cpp b/test/memory/test_memory_reservation_manager.cpp index 30b7ced..0edc176 100644 --- a/test/memory/test_memory_reservation_manager.cpp +++ b/test/memory/test_memory_reservation_manager.cpp @@ -491,8 +491,7 @@ SCENARIO("Reservation On Multi Gpu System", "[memory_space][.multi-device]") GIVEN("Dual gpu manager") { auto* gpu_space = 
manager->get_memory_space(Tier::GPU, 0); - auto* mr = - dynamic_cast(gpu_space->get_default_allocator()); + auto* mr = gpu_space->get_memory_resource_as(); REQUIRE(mr != nullptr); WHEN("a reservation doesn't fit on gpu 0 but fits on gpu 1") diff --git a/test/memory/test_small_pinned_host_memory_resource.cpp b/test/memory/test_small_pinned_host_memory_resource.cpp index 7145986..8d7ba70 100644 --- a/test/memory/test_small_pinned_host_memory_resource.cpp +++ b/test/memory/test_small_pinned_host_memory_resource.cpp @@ -174,11 +174,11 @@ TEST_CASE("Freed slabs are reused", "[small_pinned]") TEST_CASE("do_is_equal identity check", "[small_pinned]") { test_fixture f; - REQUIRE(f.slab_mr.is_equal(f.slab_mr)); + REQUIRE(f.slab_mr == f.slab_mr); // A second instance should not be equal small_pinned_host_memory_resource other{f.upstream}; - REQUIRE_FALSE(f.slab_mr.is_equal(other)); + REQUIRE_FALSE(f.slab_mr == other); } TEST_CASE("Concurrent allocations are thread-safe", "[small_pinned][threading]") diff --git a/test/unittest.cpp b/test/unittest.cpp index 1fb5f10..971273a 100644 --- a/test/unittest.cpp +++ b/test/unittest.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -48,15 +49,13 @@ class test_gpu_pool { if (initial_bytes > max_bytes) { initial_bytes = max_bytes; } pool_ = std::make_unique(initial_bytes, max_bytes); - previous_ = rmm::mr::get_current_device_resource(); - rmm::mr::set_current_device_resource(pool_.get()); + previous_ = rmm::mr::get_current_device_resource_ref(); + rmm::mr::set_current_device_resource_ref(*pool_); } ~test_gpu_pool() { - if (pool_ != nullptr && previous_ != nullptr) { - rmm::mr::set_current_device_resource(previous_); - } + if (pool_ != nullptr) { rmm::mr::set_current_device_resource_ref(previous_); } } private: @@ -75,7 +74,7 @@ class test_gpu_pool { static constexpr std::size_t default_max_bytes = 10ULL * 1024 * 1024 * 1024; std::unique_ptr pool_; - rmm::mr::device_memory_resource* previous_{nullptr}; + 
rmm::device_async_resource_ref previous_{rmm::mr::get_current_device_resource_ref()}; }; test_gpu_pool global_pool; diff --git a/test/utils/cudf_test_utils.cpp b/test/utils/cudf_test_utils.cpp index 3a7a6ff..211638b 100644 --- a/test/utils/cudf_test_utils.cpp +++ b/test/utils/cudf_test_utils.cpp @@ -15,15 +15,18 @@ * limitations under the License. */ +#include + #include #include -#include - #include #include #include +#include +#include +#include #include #include @@ -33,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -113,7 +117,8 @@ bool cudf_tables_have_equal_contents_on_stream(const cudf::table& left, std::vector left_data(data_bytes); std::vector right_data(data_bytes); - CUCASCADE_CUDA_TRY(cudaMemcpy(left_data.data(), left_col.head(), data_bytes, cudaMemcpyDeviceToHost)); + CUCASCADE_CUDA_TRY( + cudaMemcpy(left_data.data(), left_col.head(), data_bytes, cudaMemcpyDeviceToHost)); CUCASCADE_CUDA_TRY( cudaMemcpy(right_data.data(), right_col.head(), data_bytes, cudaMemcpyDeviceToHost)); @@ -150,51 +155,53 @@ void expect_cudf_tables_equal_on_stream(const cudf::table& left, } // Simple logging adaptor to print all RMM device allocations/frees with pointers/sizes/stream/tid -class logging_device_resource : public rmm::mr::device_memory_resource { +class logging_device_resource { public: - explicit logging_device_resource(rmm::mr::device_memory_resource* upstream) : _upstream(upstream) - { - } + explicit logging_device_resource(rmm::device_async_resource_ref upstream) : _upstream(upstream) {} - ~logging_device_resource() override = default; + ~logging_device_resource() = default; - private: - void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override + void* allocate(cuda::stream_ref stream, + std::size_t bytes, + std::size_t alignment = alignof(std::max_align_t)) { - void* ptr = _upstream->allocate(stream, bytes); + void* ptr = _upstream.allocate(stream, bytes, alignment); std::ostringstream oss; - oss << "[rmm-alloc] 
ptr=" << ptr << " size=" << bytes << " stream=" << stream.value() + oss << "[rmm-alloc] ptr=" << ptr << " size=" << bytes << " stream=" << stream.get() << " tid=" << std::this_thread::get_id(); std::cout << oss.str() << std::endl << std::flush; return ptr; } - void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override + void deallocate(cuda::stream_ref stream, + void* ptr, + std::size_t bytes, + std::size_t alignment = alignof(std::max_align_t)) noexcept { std::ostringstream oss; - oss << "[rmm-free ] ptr=" << ptr << " size=" << bytes << " stream=" << stream.value() + oss << "[rmm-free ] ptr=" << ptr << " size=" << bytes << " stream=" << stream.get() << " tid=" << std::this_thread::get_id(); std::cout << oss.str() << std::endl << std::flush; - _upstream->deallocate(stream, ptr, bytes); + _upstream.deallocate(stream, ptr, bytes, alignment); } - bool do_is_equal(const rmm::mr::device_memory_resource& other) const noexcept override - { - return this == &other; - } + bool operator==(logging_device_resource const& other) const noexcept { return this == &other; } + + friend void get_property(logging_device_resource const&, cuda::mr::device_accessible) noexcept {} - rmm::mr::device_memory_resource* _upstream; + private: + rmm::device_async_resource_ref _upstream; }; // Install the logging resource once per process (wraps whatever the current device resource is) static void install_rmm_logging_resource_once() { static bool installed = false; - static std::unique_ptr logging_resource; + static std::optional> logging_resource; if (!installed) { - auto* prev = rmm::mr::get_current_device_resource(); - logging_resource = std::make_unique(prev); - rmm::mr::set_current_device_resource(logging_resource.get()); + auto prev = rmm::mr::get_current_device_resource_ref(); + logging_resource.emplace(logging_device_resource{prev}); + rmm::mr::set_current_device_resource_ref(*logging_resource); installed = true; std::cout << "[rmm-log ] installed 
logging device resource adaptor" << std::endl << std::flush; } diff --git a/test/utils/mock_test_utils.hpp b/test/utils/mock_test_utils.hpp index d798934..27ca131 100644 --- a/test/utils/mock_test_utils.hpp +++ b/test/utils/mock_test_utils.hpp @@ -19,6 +19,7 @@ #include "utils/test_memory_resources.hpp" +#include #include #include #include @@ -32,11 +33,12 @@ #include #include -#include - #include #include +#include +#include +#include #include #include @@ -60,7 +62,7 @@ inline std::shared_ptr make_mock_memory_space(memory::Tier config.device_id = static_cast(device_id); config.memory_capacity = 1024 * 1024 * 1024; config.mr_factory_fn = [](int, size_t) { - return std::make_unique(); + return cuda::mr::any_resource{rmm::mr::cuda_memory_resource{}}; }; return std::make_shared(config); } else if (tier == memory::Tier::HOST) { @@ -69,7 +71,8 @@ inline std::shared_ptr make_mock_memory_space(memory::Tier config.memory_capacity = 1024 * 1024 * 1024; config.initial_number_pools = 0; config.mr_factory_fn = [](int, size_t) { - return std::make_unique(-1); + return cuda::mr::any_resource{ + cucascade::memory::numa_region_pinned_host_memory_resource{-1}}; }; return std::make_shared(config); } else if (tier == memory::Tier::DISK) { @@ -154,8 +157,8 @@ inline std::vector create_conversion_test_configs() inline cudf::table create_simple_cudf_table( int num_rows, int num_columns, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - rmm::cuda_stream_view stream = rmm::cuda_stream_default) + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource_ref(), + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { std::vector> columns; @@ -187,15 +190,15 @@ inline cudf::table create_simple_cudf_table( inline cudf::table create_simple_cudf_table( int num_rows, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - rmm::cuda_stream_view stream = rmm::cuda_stream_default) + rmm::device_async_resource_ref mr = 
rmm::mr::get_current_device_resource_ref(), + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { return create_simple_cudf_table(num_rows, 2, mr, stream); } inline cudf::table create_simple_cudf_table( - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - rmm::cuda_stream_view stream = rmm::cuda_stream_default) + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource_ref(), + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { return create_simple_cudf_table(100, 2, mr, stream); } diff --git a/test/utils/test_memory_resources.hpp b/test/utils/test_memory_resources.hpp index 2974390..d1c7c3a 100644 --- a/test/utils/test_memory_resources.hpp +++ b/test/utils/test_memory_resources.hpp @@ -17,43 +17,45 @@ #pragma once -#include -#include #include +#include -#include +#include namespace cucascade::test { -class shared_device_resource : public rmm::mr::device_memory_resource { +class shared_device_resource { public: - explicit shared_device_resource(rmm::mr::device_memory_resource* upstream) : upstream_(upstream) - { - } + explicit shared_device_resource(rmm::device_async_resource_ref upstream) : upstream_(upstream) {} - private: - void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override + void* allocate(cuda::stream_ref stream, + std::size_t bytes, + std::size_t alignment = alignof(std::max_align_t)) { - return upstream_->allocate(stream, bytes); + return upstream_.allocate(stream, bytes, alignment); } - void do_deallocate(void* p, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override + void deallocate(cuda::stream_ref stream, + void* p, + std::size_t bytes, + std::size_t alignment = alignof(std::max_align_t)) noexcept { - upstream_->deallocate(stream, p, bytes); + upstream_.deallocate(stream, p, bytes, alignment); } - bool do_is_equal(rmm::mr::device_memory_resource const& other) const noexcept override - { - return upstream_ == &other; - } + bool 
operator==(shared_device_resource const& other) const noexcept { return this == &other; }
+
+  friend void get_property(shared_device_resource const&, cuda::mr::device_accessible) noexcept {}
 
-  rmm::mr::device_memory_resource* upstream_;
+ private:
+  rmm::device_async_resource_ref upstream_;
 };
 
-inline std::unique_ptr make_shared_current_device_resource(int,
-                                                           size_t)
+inline cuda::mr::any_resource make_shared_current_device_resource(
+  int, size_t)
 {
-  return std::make_unique(rmm::mr::get_current_device_resource());
+  return cuda::mr::any_resource{
+    shared_device_resource{rmm::mr::get_current_device_resource_ref()}};
 }
 
 }  // namespace cucascade::test