[flang][cuda][rt] Track asynchronous allocation stream for deallocation (#137073)

When an asynchronous allocation is made, we call `cudaMallocAsync` with
a stream. For deallocation, we need to call `cudaFreeAsync` with the
same stream. in order to achieve that, we need to track the allocation
and their respective stream.

This patch adds a simple sorted array of asynchronous allocations. A
binary search is performed to retrieve the allocation when deallocation
is needed.
This commit is contained in:
Valentin Clement (バレンタイン クレメン) 2025-04-24 10:01:47 -07:00 committed by GitHub
parent e329b6c530
commit 565a075909
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 171 additions and 1 deletions

View File

@ -11,6 +11,7 @@
#include "flang-rt/runtime/derived.h"
#include "flang-rt/runtime/descriptor.h"
#include "flang-rt/runtime/environment.h"
#include "flang-rt/runtime/lock.h"
#include "flang-rt/runtime/stat.h"
#include "flang-rt/runtime/terminator.h"
#include "flang-rt/runtime/type-info.h"
@ -21,6 +22,105 @@
#include "cuda_runtime.h"
namespace Fortran::runtime::cuda {
struct DeviceAllocation {
void *ptr;
std::size_t size;
cudaStream_t stream;
};
// Compare address values. nullptr will be sorted at the end of the array.
int compareDeviceAlloc(const void *a, const void *b) {
const DeviceAllocation *deva = (const DeviceAllocation *)a;
const DeviceAllocation *devb = (const DeviceAllocation *)b;
if (deva->ptr == nullptr && devb->ptr == nullptr)
return 0;
if (deva->ptr == nullptr)
return 1;
if (devb->ptr == nullptr)
return -1;
return deva->ptr < devb->ptr ? -1 : (deva->ptr > devb->ptr ? 1 : 0);
}
// Dynamic array for tracking asynchronous allocations.
static DeviceAllocation *deviceAllocations = nullptr;
Lock lock;
static int maxDeviceAllocations{512}; // Initial size
static int numDeviceAllocations{0};
static constexpr int allocNotFound{-1};
static void initAllocations() {
if (!deviceAllocations) {
deviceAllocations = static_cast<DeviceAllocation *>(
malloc(maxDeviceAllocations * sizeof(DeviceAllocation)));
if (!deviceAllocations) {
Terminator terminator{__FILE__, __LINE__};
terminator.Crash("Failed to allocate tracking array");
}
}
}
static void doubleAllocationArray() {
unsigned newSize = maxDeviceAllocations * 2;
DeviceAllocation *newArray = static_cast<DeviceAllocation *>(
realloc(deviceAllocations, newSize * sizeof(DeviceAllocation)));
if (!newArray) {
Terminator terminator{__FILE__, __LINE__};
terminator.Crash("Failed to reallocate tracking array");
}
deviceAllocations = newArray;
maxDeviceAllocations = newSize;
}
static unsigned findAllocation(void *ptr) {
if (numDeviceAllocations == 0) {
return allocNotFound;
}
int left{0};
int right{numDeviceAllocations - 1};
if (left == right) {
return left;
}
while (left <= right) {
int mid = left + (right - left) / 2;
if (deviceAllocations[mid].ptr == ptr) {
return mid;
}
if (deviceAllocations[mid].ptr < ptr) {
left = mid + 1;
} else {
right = mid - 1;
}
}
return allocNotFound;
}
static void insertAllocation(void *ptr, std::size_t size, std::int64_t stream) {
CriticalSection critical{lock};
initAllocations();
if (numDeviceAllocations >= maxDeviceAllocations) {
doubleAllocationArray();
}
deviceAllocations[numDeviceAllocations].ptr = ptr;
deviceAllocations[numDeviceAllocations].size = size;
deviceAllocations[numDeviceAllocations].stream = (cudaStream_t)stream;
++numDeviceAllocations;
qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
compareDeviceAlloc);
}
static void eraseAllocation(int pos) {
deviceAllocations[pos].ptr = nullptr;
deviceAllocations[pos].size = 0;
deviceAllocations[pos].stream = (cudaStream_t)0;
qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
compareDeviceAlloc);
--numDeviceAllocations;
}
extern "C" {
void RTDEF(CUFRegisterAllocator)() {
@ -55,12 +155,23 @@ void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t asyncId) {
} else {
CUDA_REPORT_IF_ERROR(
cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)asyncId));
insertAllocation(p, sizeInBytes, asyncId);
}
}
return p;
}
void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
void CUFFreeDevice(void *p) {
CriticalSection critical{lock};
int pos = findAllocation(p);
if (pos >= 0) {
cudaStream_t stream = deviceAllocations[pos].stream;
eraseAllocation(pos);
CUDA_REPORT_IF_ERROR(cudaFreeAsync(p, stream));
} else {
CUDA_REPORT_IF_ERROR(cudaFree(p));
}
}
void *CUFAllocManaged(
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {

View File

@ -58,3 +58,62 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocatable) {
EXPECT_EQ(cudaSuccess, cudaGetLastError());
}
TEST(AllocatableCUFTest, StreamDeviceAllocatable) {
using Fortran::common::TypeCategory;
RTNAME(CUFRegisterAllocator)();
// REAL(4), DEVICE, ALLOCATABLE :: a(:)
auto a{createAllocatable(TypeCategory::Real, 4)};
a->SetAllocIdx(kDeviceAllocatorPos);
EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
EXPECT_FALSE(a->HasAddendum());
RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
auto b{createAllocatable(TypeCategory::Real, 4)};
b->SetAllocIdx(kDeviceAllocatorPos);
EXPECT_EQ((int)kDeviceAllocatorPos, b->GetAllocIdx());
EXPECT_FALSE(b->HasAddendum());
RTNAME(AllocatableSetBounds)(*b, 0, 1, 20);
auto c{createAllocatable(TypeCategory::Real, 4)};
c->SetAllocIdx(kDeviceAllocatorPos);
EXPECT_EQ((int)kDeviceAllocatorPos, c->GetAllocIdx());
EXPECT_FALSE(b->HasAddendum());
RTNAME(AllocatableSetBounds)(*c, 0, 1, 100);
RTNAME(AllocatableAllocate)
(*a, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
EXPECT_TRUE(a->IsAllocated());
cudaDeviceSynchronize();
EXPECT_EQ(cudaSuccess, cudaGetLastError());
RTNAME(AllocatableAllocate)
(*b, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
EXPECT_TRUE(b->IsAllocated());
cudaDeviceSynchronize();
EXPECT_EQ(cudaSuccess, cudaGetLastError());
RTNAME(AllocatableAllocate)
(*c, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
EXPECT_TRUE(c->IsAllocated());
cudaDeviceSynchronize();
EXPECT_EQ(cudaSuccess, cudaGetLastError());
RTNAME(AllocatableDeallocate)
(*b, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
EXPECT_FALSE(b->IsAllocated());
cudaDeviceSynchronize();
EXPECT_EQ(cudaSuccess, cudaGetLastError());
RTNAME(AllocatableDeallocate)
(*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
EXPECT_FALSE(a->IsAllocated());
cudaDeviceSynchronize();
EXPECT_EQ(cudaSuccess, cudaGetLastError());
RTNAME(AllocatableDeallocate)
(*c, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
EXPECT_FALSE(c->IsAllocated());
cudaDeviceSynchronize();
EXPECT_EQ(cudaSuccess, cudaGetLastError());
}