[flang][cuda][rt] Track asynchronous allocation stream for deallocation (#137073)
When an asynchronous allocation is made, we call `cudaMallocAsync` with a stream. For deallocation, we need to call `cudaFreeAsync` with the same stream. in order to achieve that, we need to track the allocation and their respective stream. This patch adds a simple sorted array of asynchronous allocations. A binary search is performed to retrieve the allocation when deallocation is needed.
This commit is contained in:
parent
e329b6c530
commit
565a075909
@ -11,6 +11,7 @@
|
||||
#include "flang-rt/runtime/derived.h"
|
||||
#include "flang-rt/runtime/descriptor.h"
|
||||
#include "flang-rt/runtime/environment.h"
|
||||
#include "flang-rt/runtime/lock.h"
|
||||
#include "flang-rt/runtime/stat.h"
|
||||
#include "flang-rt/runtime/terminator.h"
|
||||
#include "flang-rt/runtime/type-info.h"
|
||||
@ -21,6 +22,105 @@
|
||||
#include "cuda_runtime.h"
|
||||
|
||||
namespace Fortran::runtime::cuda {
|
||||
|
||||
struct DeviceAllocation {
|
||||
void *ptr;
|
||||
std::size_t size;
|
||||
cudaStream_t stream;
|
||||
};
|
||||
|
||||
// Compare address values. nullptr will be sorted at the end of the array.
|
||||
int compareDeviceAlloc(const void *a, const void *b) {
|
||||
const DeviceAllocation *deva = (const DeviceAllocation *)a;
|
||||
const DeviceAllocation *devb = (const DeviceAllocation *)b;
|
||||
if (deva->ptr == nullptr && devb->ptr == nullptr)
|
||||
return 0;
|
||||
if (deva->ptr == nullptr)
|
||||
return 1;
|
||||
if (devb->ptr == nullptr)
|
||||
return -1;
|
||||
return deva->ptr < devb->ptr ? -1 : (deva->ptr > devb->ptr ? 1 : 0);
|
||||
}
|
||||
|
||||
// Dynamic array for tracking asynchronous allocations.
|
||||
static DeviceAllocation *deviceAllocations = nullptr;
|
||||
Lock lock;
|
||||
static int maxDeviceAllocations{512}; // Initial size
|
||||
static int numDeviceAllocations{0};
|
||||
static constexpr int allocNotFound{-1};
|
||||
|
||||
static void initAllocations() {
|
||||
if (!deviceAllocations) {
|
||||
deviceAllocations = static_cast<DeviceAllocation *>(
|
||||
malloc(maxDeviceAllocations * sizeof(DeviceAllocation)));
|
||||
if (!deviceAllocations) {
|
||||
Terminator terminator{__FILE__, __LINE__};
|
||||
terminator.Crash("Failed to allocate tracking array");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void doubleAllocationArray() {
|
||||
unsigned newSize = maxDeviceAllocations * 2;
|
||||
DeviceAllocation *newArray = static_cast<DeviceAllocation *>(
|
||||
realloc(deviceAllocations, newSize * sizeof(DeviceAllocation)));
|
||||
if (!newArray) {
|
||||
Terminator terminator{__FILE__, __LINE__};
|
||||
terminator.Crash("Failed to reallocate tracking array");
|
||||
}
|
||||
deviceAllocations = newArray;
|
||||
maxDeviceAllocations = newSize;
|
||||
}
|
||||
|
||||
static unsigned findAllocation(void *ptr) {
|
||||
if (numDeviceAllocations == 0) {
|
||||
return allocNotFound;
|
||||
}
|
||||
|
||||
int left{0};
|
||||
int right{numDeviceAllocations - 1};
|
||||
|
||||
if (left == right) {
|
||||
return left;
|
||||
}
|
||||
|
||||
while (left <= right) {
|
||||
int mid = left + (right - left) / 2;
|
||||
if (deviceAllocations[mid].ptr == ptr) {
|
||||
return mid;
|
||||
}
|
||||
if (deviceAllocations[mid].ptr < ptr) {
|
||||
left = mid + 1;
|
||||
} else {
|
||||
right = mid - 1;
|
||||
}
|
||||
}
|
||||
return allocNotFound;
|
||||
}
|
||||
|
||||
static void insertAllocation(void *ptr, std::size_t size, std::int64_t stream) {
|
||||
CriticalSection critical{lock};
|
||||
initAllocations();
|
||||
if (numDeviceAllocations >= maxDeviceAllocations) {
|
||||
doubleAllocationArray();
|
||||
}
|
||||
deviceAllocations[numDeviceAllocations].ptr = ptr;
|
||||
deviceAllocations[numDeviceAllocations].size = size;
|
||||
deviceAllocations[numDeviceAllocations].stream = (cudaStream_t)stream;
|
||||
++numDeviceAllocations;
|
||||
qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
|
||||
compareDeviceAlloc);
|
||||
}
|
||||
|
||||
static void eraseAllocation(int pos) {
|
||||
deviceAllocations[pos].ptr = nullptr;
|
||||
deviceAllocations[pos].size = 0;
|
||||
deviceAllocations[pos].stream = (cudaStream_t)0;
|
||||
qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
|
||||
compareDeviceAlloc);
|
||||
--numDeviceAllocations;
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
void RTDEF(CUFRegisterAllocator)() {
|
||||
@ -55,12 +155,23 @@ void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t asyncId) {
|
||||
} else {
|
||||
CUDA_REPORT_IF_ERROR(
|
||||
cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)asyncId));
|
||||
insertAllocation(p, sizeInBytes, asyncId);
|
||||
}
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
|
||||
void CUFFreeDevice(void *p) {
|
||||
CriticalSection critical{lock};
|
||||
int pos = findAllocation(p);
|
||||
if (pos >= 0) {
|
||||
cudaStream_t stream = deviceAllocations[pos].stream;
|
||||
eraseAllocation(pos);
|
||||
CUDA_REPORT_IF_ERROR(cudaFreeAsync(p, stream));
|
||||
} else {
|
||||
CUDA_REPORT_IF_ERROR(cudaFree(p));
|
||||
}
|
||||
}
|
||||
|
||||
void *CUFAllocManaged(
|
||||
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
|
||||
|
@ -58,3 +58,62 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocatable) {
|
||||
|
||||
EXPECT_EQ(cudaSuccess, cudaGetLastError());
|
||||
}
|
||||
|
||||
TEST(AllocatableCUFTest, StreamDeviceAllocatable) {
|
||||
using Fortran::common::TypeCategory;
|
||||
RTNAME(CUFRegisterAllocator)();
|
||||
// REAL(4), DEVICE, ALLOCATABLE :: a(:)
|
||||
auto a{createAllocatable(TypeCategory::Real, 4)};
|
||||
a->SetAllocIdx(kDeviceAllocatorPos);
|
||||
EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
|
||||
EXPECT_FALSE(a->HasAddendum());
|
||||
RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
|
||||
|
||||
auto b{createAllocatable(TypeCategory::Real, 4)};
|
||||
b->SetAllocIdx(kDeviceAllocatorPos);
|
||||
EXPECT_EQ((int)kDeviceAllocatorPos, b->GetAllocIdx());
|
||||
EXPECT_FALSE(b->HasAddendum());
|
||||
RTNAME(AllocatableSetBounds)(*b, 0, 1, 20);
|
||||
|
||||
auto c{createAllocatable(TypeCategory::Real, 4)};
|
||||
c->SetAllocIdx(kDeviceAllocatorPos);
|
||||
EXPECT_EQ((int)kDeviceAllocatorPos, c->GetAllocIdx());
|
||||
EXPECT_FALSE(b->HasAddendum());
|
||||
RTNAME(AllocatableSetBounds)(*c, 0, 1, 100);
|
||||
|
||||
RTNAME(AllocatableAllocate)
|
||||
(*a, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
|
||||
EXPECT_TRUE(a->IsAllocated());
|
||||
cudaDeviceSynchronize();
|
||||
EXPECT_EQ(cudaSuccess, cudaGetLastError());
|
||||
|
||||
RTNAME(AllocatableAllocate)
|
||||
(*b, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
|
||||
EXPECT_TRUE(b->IsAllocated());
|
||||
cudaDeviceSynchronize();
|
||||
EXPECT_EQ(cudaSuccess, cudaGetLastError());
|
||||
|
||||
RTNAME(AllocatableAllocate)
|
||||
(*c, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
|
||||
EXPECT_TRUE(c->IsAllocated());
|
||||
cudaDeviceSynchronize();
|
||||
EXPECT_EQ(cudaSuccess, cudaGetLastError());
|
||||
|
||||
RTNAME(AllocatableDeallocate)
|
||||
(*b, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
|
||||
EXPECT_FALSE(b->IsAllocated());
|
||||
cudaDeviceSynchronize();
|
||||
EXPECT_EQ(cudaSuccess, cudaGetLastError());
|
||||
|
||||
RTNAME(AllocatableDeallocate)
|
||||
(*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
|
||||
EXPECT_FALSE(a->IsAllocated());
|
||||
cudaDeviceSynchronize();
|
||||
EXPECT_EQ(cudaSuccess, cudaGetLastError());
|
||||
|
||||
RTNAME(AllocatableDeallocate)
|
||||
(*c, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
|
||||
EXPECT_FALSE(c->IsAllocated());
|
||||
cudaDeviceSynchronize();
|
||||
EXPECT_EQ(cudaSuccess, cudaGetLastError());
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user