From 565a075909046f74c2fbb7713419518464599a4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com>
Date: Thu, 24 Apr 2025 10:01:47 -0700
Subject: [PATCH] [flang][cuda][rt] Track asynchronous allocation stream for
 deallocation (#137073)

When an asynchronous allocation is made, we call `cudaMallocAsync` with
a stream. For deallocation, we need to call `cudaFreeAsync` with the
same stream. in order to achieve that, we need to track the allocation
and their respective stream.

This patch adds a simple sorted array of asynchronous allocations. A
binary search is performed to retrieve the allocation when deallocation
is needed.
---
 flang-rt/lib/cuda/allocator.cpp               | 113 +++++++++++++++++-
 .../unittests/Runtime/CUDA/Allocatable.cpp    |  59 +++++++++
 2 files changed, 171 insertions(+), 1 deletion(-)
diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp
index a1c3a2c1b2ea..51119ab25116 100644
--- a/flang-rt/lib/cuda/allocator.cpp
+++ b/flang-rt/lib/cuda/allocator.cpp
@@ -11,6 +11,7 @@
 #include "flang-rt/runtime/derived.h"
 #include "flang-rt/runtime/descriptor.h"
 #include "flang-rt/runtime/environment.h"
+#include "flang-rt/runtime/lock.h"
 #include "flang-rt/runtime/stat.h"
 #include "flang-rt/runtime/terminator.h"
 #include "flang-rt/runtime/type-info.h"
@@ -21,6 +22,105 @@
 #include "cuda_runtime.h"
 
 namespace Fortran::runtime::cuda {
+
+struct DeviceAllocation {
+  void *ptr;
+  std::size_t size;
+  cudaStream_t stream;
+};
+
+// Compare address values. nullptr will be sorted at the end of the array.
+int compareDeviceAlloc(const void *a, const void *b) {
+  const DeviceAllocation *deva = (const DeviceAllocation *)a;
+  const DeviceAllocation *devb = (const DeviceAllocation *)b;
+  if (deva->ptr == nullptr && devb->ptr == nullptr)
+    return 0;
+  if (deva->ptr == nullptr)
+    return 1;
+  if (devb->ptr == nullptr)
+    return -1;
+  return deva->ptr < devb->ptr ? -1 : (deva->ptr > devb->ptr ? 1 : 0);
+}
+
+// Dynamic array for tracking asynchronous allocations.
+static DeviceAllocation *deviceAllocations = nullptr;
+Lock lock;
+static int maxDeviceAllocations{512}; // Initial size
+static int numDeviceAllocations{0};
+static constexpr int allocNotFound{-1};
+
+static void initAllocations() {
+  if (!deviceAllocations) {
+    deviceAllocations = static_cast<DeviceAllocation *>(
+        malloc(maxDeviceAllocations * sizeof(DeviceAllocation)));
+    if (!deviceAllocations) {
+      Terminator terminator{__FILE__, __LINE__};
+      terminator.Crash("Failed to allocate tracking array");
+    }
+  }
+}
+
+static void doubleAllocationArray() {
+  unsigned newSize = maxDeviceAllocations * 2;
+  DeviceAllocation *newArray = static_cast<DeviceAllocation *>(
+      realloc(deviceAllocations, newSize * sizeof(DeviceAllocation)));
+  if (!newArray) {
+    Terminator terminator{__FILE__, __LINE__};
+    terminator.Crash("Failed to reallocate tracking array");
+  }
+  deviceAllocations = newArray;
+  maxDeviceAllocations = newSize;
+}
+
+static unsigned findAllocation(void *ptr) {
+  if (numDeviceAllocations == 0) {
+    return allocNotFound;
+  }
+
+  int left{0};
+  int right{numDeviceAllocations - 1};
+
+  if (left == right) {
+    return left;
+  }
+
+  while (left <= right) {
+    int mid = left + (right - left) / 2;
+    if (deviceAllocations[mid].ptr == ptr) {
+      return mid;
+    }
+    if (deviceAllocations[mid].ptr < ptr) {
+      left = mid + 1;
+    } else {
+      right = mid - 1;
+    }
+  }
+  return allocNotFound;
+}
+
+static void insertAllocation(void *ptr, std::size_t size, std::int64_t stream) {
+  CriticalSection critical{lock};
+  initAllocations();
+  if (numDeviceAllocations >= maxDeviceAllocations) {
+    doubleAllocationArray();
+  }
+  deviceAllocations[numDeviceAllocations].ptr = ptr;
+  deviceAllocations[numDeviceAllocations].size = size;
+  deviceAllocations[numDeviceAllocations].stream = (cudaStream_t)stream;
+  ++numDeviceAllocations;
+  qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
+      compareDeviceAlloc);
+}
+
+static void eraseAllocation(int pos) {
+  deviceAllocations[pos].ptr = nullptr;
+  deviceAllocations[pos].size = 0;
+  deviceAllocations[pos].stream = (cudaStream_t)0;
+  qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
+      compareDeviceAlloc);
+  --numDeviceAllocations;
+}
+
 extern "C" {
 
 void RTDEF(CUFRegisterAllocator)() {
@@ -55,12 +155,23 @@ void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t asyncId) {
     } else {
       CUDA_REPORT_IF_ERROR(
           cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)asyncId));
+      insertAllocation(p, sizeInBytes, asyncId);
     }
   }
   return p;
 }
 
-void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
+void CUFFreeDevice(void *p) {
+  CriticalSection critical{lock};
+  int pos = findAllocation(p);
+  if (pos >= 0) {
+    cudaStream_t stream = deviceAllocations[pos].stream;
+    eraseAllocation(pos);
+    CUDA_REPORT_IF_ERROR(cudaFreeAsync(p, stream));
+  } else {
+    CUDA_REPORT_IF_ERROR(cudaFree(p));
+  }
+}
 
 void *CUFAllocManaged(
     std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
diff --git a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
index 1c8ded0f87d4..89649aa95ad9 100644
--- a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
+++ b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
@@ -58,3 +58,62 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocatable) {
 
   EXPECT_EQ(cudaSuccess, cudaGetLastError());
 }
+
+TEST(AllocatableCUFTest, StreamDeviceAllocatable) {
+  using Fortran::common::TypeCategory;
+  RTNAME(CUFRegisterAllocator)();
+  // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+  auto a{createAllocatable(TypeCategory::Real, 4)};
+  a->SetAllocIdx(kDeviceAllocatorPos);
+  EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
+  EXPECT_FALSE(a->HasAddendum());
+  RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
+
+  auto b{createAllocatable(TypeCategory::Real, 4)};
+  b->SetAllocIdx(kDeviceAllocatorPos);
+  EXPECT_EQ((int)kDeviceAllocatorPos, b->GetAllocIdx());
+  EXPECT_FALSE(b->HasAddendum());
+  RTNAME(AllocatableSetBounds)(*b, 0, 1, 20);
+
+  auto c{createAllocatable(TypeCategory::Real, 4)};
+  c->SetAllocIdx(kDeviceAllocatorPos);
+  EXPECT_EQ((int)kDeviceAllocatorPos, c->GetAllocIdx());
+  EXPECT_FALSE(b->HasAddendum());
+  RTNAME(AllocatableSetBounds)(*c, 0, 1, 100);
+
+  RTNAME(AllocatableAllocate)
+  (*a, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_TRUE(a->IsAllocated());
+  cudaDeviceSynchronize();
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+
+  RTNAME(AllocatableAllocate)
+  (*b, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_TRUE(b->IsAllocated());
+  cudaDeviceSynchronize();
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+
+  RTNAME(AllocatableAllocate)
+  (*c, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_TRUE(c->IsAllocated());
+  cudaDeviceSynchronize();
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+
+  RTNAME(AllocatableDeallocate)
+  (*b, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_FALSE(b->IsAllocated());
+  cudaDeviceSynchronize();
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+
+  RTNAME(AllocatableDeallocate)
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_FALSE(a->IsAllocated());
+  cudaDeviceSynchronize();
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+
+  RTNAME(AllocatableDeallocate)
+  (*c, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_FALSE(c->IsAllocated());
+  cudaDeviceSynchronize();
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+}