From 565a075909046f74c2fbb7713419518464599a4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 24 Apr 2025 10:01:47 -0700 Subject: [PATCH] [flang][cuda][rt] Track asynchronous allocation stream for deallocation (#137073) When an asynchronous allocation is made, we call `cudaMallocAsync` with a stream. For deallocation, we need to call `cudaFreeAsync` with the same stream. in order to achieve that, we need to track the allocation and their respective stream. This patch adds a simple sorted array of asynchronous allocations. A binary search is performed to retrieve the allocation when deallocation is needed. --- flang-rt/lib/cuda/allocator.cpp | 113 +++++++++++++++++- .../unittests/Runtime/CUDA/Allocatable.cpp | 59 +++++++++ 2 files changed, 171 insertions(+), 1 deletion(-) diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp index a1c3a2c1b2ea..51119ab25116 100644 --- a/flang-rt/lib/cuda/allocator.cpp +++ b/flang-rt/lib/cuda/allocator.cpp @@ -11,6 +11,7 @@ #include "flang-rt/runtime/derived.h" #include "flang-rt/runtime/descriptor.h" #include "flang-rt/runtime/environment.h" +#include "flang-rt/runtime/lock.h" #include "flang-rt/runtime/stat.h" #include "flang-rt/runtime/terminator.h" #include "flang-rt/runtime/type-info.h" @@ -21,6 +22,105 @@ #include "cuda_runtime.h" namespace Fortran::runtime::cuda { + +struct DeviceAllocation { + void *ptr; + std::size_t size; + cudaStream_t stream; +}; + +// Compare address values. nullptr will be sorted at the end of the array. +int compareDeviceAlloc(const void *a, const void *b) { + const DeviceAllocation *deva = (const DeviceAllocation *)a; + const DeviceAllocation *devb = (const DeviceAllocation *)b; + if (deva->ptr == nullptr && devb->ptr == nullptr) + return 0; + if (deva->ptr == nullptr) + return 1; + if (devb->ptr == nullptr) + return -1; + return deva->ptr < devb->ptr ? -1 : (deva->ptr > devb->ptr ? 1 : 0); +} + +// Dynamic array for tracking asynchronous allocations. +static DeviceAllocation *deviceAllocations = nullptr; +Lock lock; +static int maxDeviceAllocations{512}; // Initial size +static int numDeviceAllocations{0}; +static constexpr int allocNotFound{-1}; + +static void initAllocations() { + if (!deviceAllocations) { + deviceAllocations = static_cast( + malloc(maxDeviceAllocations * sizeof(DeviceAllocation))); + if (!deviceAllocations) { + Terminator terminator{__FILE__, __LINE__}; + terminator.Crash("Failed to allocate tracking array"); + } + } +} + +static void doubleAllocationArray() { + unsigned newSize = maxDeviceAllocations * 2; + DeviceAllocation *newArray = static_cast( + realloc(deviceAllocations, newSize * sizeof(DeviceAllocation))); + if (!newArray) { + Terminator terminator{__FILE__, __LINE__}; + terminator.Crash("Failed to reallocate tracking array"); + } + deviceAllocations = newArray; + maxDeviceAllocations = newSize; +} + +static unsigned findAllocation(void *ptr) { + if (numDeviceAllocations == 0) { + return allocNotFound; + } + + int left{0}; + int right{numDeviceAllocations - 1}; + + if (left == right) { + return left; + } + + while (left <= right) { + int mid = left + (right - left) / 2; + if (deviceAllocations[mid].ptr == ptr) { + return mid; + } + if (deviceAllocations[mid].ptr < ptr) { + left = mid + 1; + } else { + right = mid - 1; + } + } + return allocNotFound; +} + +static void insertAllocation(void *ptr, std::size_t size, std::int64_t stream) { + CriticalSection critical{lock}; + initAllocations(); + if (numDeviceAllocations >= maxDeviceAllocations) { + doubleAllocationArray(); + } + deviceAllocations[numDeviceAllocations].ptr = ptr; + deviceAllocations[numDeviceAllocations].size = size; + deviceAllocations[numDeviceAllocations].stream = (cudaStream_t)stream; + ++numDeviceAllocations; + qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation), + compareDeviceAlloc); +} + +static void eraseAllocation(int pos) { + deviceAllocations[pos].ptr = nullptr; + deviceAllocations[pos].size = 0; + deviceAllocations[pos].stream = (cudaStream_t)0; + qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation), + compareDeviceAlloc); + --numDeviceAllocations; +} + extern "C" { void RTDEF(CUFRegisterAllocator)() { @@ -55,12 +155,23 @@ void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t asyncId) { } else { CUDA_REPORT_IF_ERROR( cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)asyncId)); + insertAllocation(p, sizeInBytes, asyncId); } } return p; } -void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); } +void CUFFreeDevice(void *p) { + CriticalSection critical{lock}; + int pos = findAllocation(p); + if (pos >= 0) { + cudaStream_t stream = deviceAllocations[pos].stream; + eraseAllocation(pos); + CUDA_REPORT_IF_ERROR(cudaFreeAsync(p, stream)); + } else { + CUDA_REPORT_IF_ERROR(cudaFree(p)); + } +} void *CUFAllocManaged( std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) { diff --git a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp index 1c8ded0f87d4..89649aa95ad9 100644 --- a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp +++ b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp @@ -58,3 +58,62 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocatable) { EXPECT_EQ(cudaSuccess, cudaGetLastError()); } + +TEST(AllocatableCUFTest, StreamDeviceAllocatable) { + using Fortran::common::TypeCategory; + RTNAME(CUFRegisterAllocator)(); + // REAL(4), DEVICE, ALLOCATABLE :: a(:) + auto a{createAllocatable(TypeCategory::Real, 4)}; + a->SetAllocIdx(kDeviceAllocatorPos); + EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx()); + EXPECT_FALSE(a->HasAddendum()); + RTNAME(AllocatableSetBounds)(*a, 0, 1, 10); + + auto b{createAllocatable(TypeCategory::Real, 4)}; + b->SetAllocIdx(kDeviceAllocatorPos); + EXPECT_EQ((int)kDeviceAllocatorPos, b->GetAllocIdx()); + EXPECT_FALSE(b->HasAddendum()); + RTNAME(AllocatableSetBounds)(*b, 0, 1, 20); + + auto c{createAllocatable(TypeCategory::Real, 4)}; + c->SetAllocIdx(kDeviceAllocatorPos); + EXPECT_EQ((int)kDeviceAllocatorPos, c->GetAllocIdx()); + EXPECT_FALSE(b->HasAddendum()); + RTNAME(AllocatableSetBounds)(*c, 0, 1, 100); + + RTNAME(AllocatableAllocate) + (*a, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_TRUE(a->IsAllocated()); + cudaDeviceSynchronize(); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); + + RTNAME(AllocatableAllocate) + (*b, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_TRUE(b->IsAllocated()); + cudaDeviceSynchronize(); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); + + RTNAME(AllocatableAllocate) + (*c, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_TRUE(c->IsAllocated()); + cudaDeviceSynchronize(); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); + + RTNAME(AllocatableDeallocate) + (*b, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_FALSE(b->IsAllocated()); + cudaDeviceSynchronize(); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); + + RTNAME(AllocatableDeallocate) + (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_FALSE(a->IsAllocated()); + cudaDeviceSynchronize(); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); + + RTNAME(AllocatableDeallocate) + (*c, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_FALSE(c->IsAllocated()); + cudaDeviceSynchronize(); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); +}