From 26b4c25b8bceb414471d825395e418636d9d17e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 26 Feb 2026 16:24:29 -0800 Subject: [PATCH] [flang][cuda] Add support for cudaStreamDestroy (#183648) Add specific lowering and entry point for cudaStreamDestroy. Since we keep associated stream for some allocation, we need to reset it when the stream is destroy so we don't use it anymore. --- flang-rt/lib/cuda/allocator.cpp | 9 +++++ flang-rt/lib/cuda/stream.cpp | 14 +++++--- .../unittests/Runtime/CUDA/Allocatable.cpp | 35 +++++++++++++++++++ .../Optimizer/Builder/CUDAIntrinsicCall.h | 2 ++ flang/include/flang/Runtime/CUDA/allocator.h | 2 ++ flang/include/flang/Runtime/CUDA/stream.h | 1 + .../Optimizer/Builder/CUDAIntrinsicCall.cpp | 19 ++++++++++ flang/module/cuda_runtime_api.f90 | 8 +++++ flang/test/Lower/CUDA/cuda-default-stream.cuf | 10 ++++++ 9 files changed, 96 insertions(+), 4 deletions(-) diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp index df7e43de00c7..b66033cb8668 100644 --- a/flang-rt/lib/cuda/allocator.cpp +++ b/flang-rt/lib/cuda/allocator.cpp @@ -119,6 +119,15 @@ static void eraseAllocation(int pos) { --numDeviceAllocations; } +void CUFResetStream(cudaStream_t stream) { + CriticalSection critical{lock}; + for (int i = 0; i < numDeviceAllocations; ++i) { + if (deviceAllocations[i].stream == stream) { + deviceAllocations[i].stream = nullptr; + } + } +} + extern "C" { void RTDEF(CUFRegisterAllocator)() { diff --git a/flang-rt/lib/cuda/stream.cpp b/flang-rt/lib/cuda/stream.cpp index 20cf49989e2e..12beaa23560d 100644 --- a/flang-rt/lib/cuda/stream.cpp +++ b/flang-rt/lib/cuda/stream.cpp @@ -14,6 +14,7 @@ #include "flang-rt/runtime/lock.h" #include "flang-rt/runtime/stat.h" #include "flang-rt/runtime/terminator.h" +#include "flang/Runtime/CUDA/allocator.h" #include "flang/Runtime/CUDA/common.h" #include "flang/Support/Fortran.h" @@ -23,20 +24,25 @@ static thread_local cudaStream_t defaultStream{nullptr}; extern "C" { -int RTDECL(CUFSetDefaultStream)(cudaStream_t stream) { +int RTDEF(CUFSetDefaultStream)(cudaStream_t stream) { defaultStream = stream; return StatOk; } -cudaStream_t RTDECL(CUFGetDefaultStream)() { return defaultStream; } +cudaStream_t RTDEF(CUFGetDefaultStream)() { return defaultStream; } -int RTDECL(CUFStreamSynchronize)(cudaStream_t stream) { +int RTDEF(CUFStreamSynchronize)(cudaStream_t stream) { return cudaStreamSynchronize(stream); } -int RTDECL(CUFStreamSynchronizeNull)() { +int RTDEF(CUFStreamSynchronizeNull)() { return cudaStreamSynchronize(RTNAME(CUFGetDefaultStream)()); } + +int RTDEF(CUFStreamDestroy)(cudaStream_t stream) { + CUFResetStream(stream); + return cudaStreamDestroy(stream); +} } } // namespace Fortran::runtime::cuda diff --git a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp index 1e98acfd5151..9ca1bac3ec8f 100644 --- a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp +++ b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp @@ -209,3 +209,38 @@ TEST(AllocatableAsyncTest, SetStreamTest) { int stat2 = RTDECL(CUFSetAssociatedStream)(b->raw().base_addr, stream); EXPECT_EQ(stat2, StatBaseNull); } + +TEST(AllocatableAsyncTest, DestroyStreamTest) { + using Fortran::common::TypeCategory; + RTNAME(CUFRegisterAllocator)(); + // REAL(4), DEVICE, ALLOCATABLE :: a(:) + auto a{createAllocatable(TypeCategory::Real, 4)}; + a->SetAllocIdx(kDeviceAllocatorPos); + EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx()); + EXPECT_FALSE(a->HasAddendum()); + RTNAME(AllocatableSetBounds)(*a, 0, 1, 10); + + cudaStream_t stream; + cudaStreamCreate(&stream); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); + + RTNAME(AllocatableAllocate) + (*a, /*asyncObject=*/(std::int64_t *)&stream, /*hasStat=*/false, + /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_TRUE(a->IsAllocated()); + cudaDeviceSynchronize(); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); + + cudaStream_t s = RTNAME(CUFGetAssociatedStream)(a->raw().base_addr); + EXPECT_EQ(s, stream); + + RTNAME(CUFStreamDestroy)(stream); + s = RTNAME(CUFGetAssociatedStream)(a->raw().base_addr); + EXPECT_EQ(s, nullptr); + + RTNAME(AllocatableDeallocate) + (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_FALSE(a->IsAllocated()); + cudaDeviceSynchronize(); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); +} diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h index 5e8fbcd1b93a..6167a876f7b6 100644 --- a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h @@ -65,6 +65,8 @@ struct CUDAIntrinsicLibrary : IntrinsicLibrary { genCUDAStreamSynchronize(mlir::Type, llvm::ArrayRef); mlir::Value genCUDAStreamSynchronizeNull(mlir::Type, llvm::ArrayRef); + fir::ExtendedValue genCUDAStreamDestroy(mlir::Type, + llvm::ArrayRef); void genFenceProxyAsync(llvm::ArrayRef); template fir::ExtendedValue genLDXXFunc(mlir::Type, diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h index 698b979636da..18907252b575 100644 --- a/flang/include/flang/Runtime/CUDA/allocator.h +++ b/flang/include/flang/Runtime/CUDA/allocator.h @@ -23,6 +23,8 @@ int RTDECL(CUFSetAssociatedStream)(void *, cudaStream_t); void RTDECL(CUFRegisterAllocator)(); } +void CUFResetStream(cudaStream_t stream); + void *CUFAllocPinned(std::size_t, std::int64_t *); void CUFFreePinned(void *); diff --git a/flang/include/flang/Runtime/CUDA/stream.h b/flang/include/flang/Runtime/CUDA/stream.h index 686b828d4e14..7431ac6a27e2 100644 --- a/flang/include/flang/Runtime/CUDA/stream.h +++ b/flang/include/flang/Runtime/CUDA/stream.h @@ -23,6 +23,7 @@ int RTDECL(CUFSetDefaultStream)(cudaStream_t); cudaStream_t RTDECL(CUFGetDefaultStream)(); int RTDECL(CUFStreamSynchronize)(cudaStream_t); int RTDECL(CUFStreamSynchronizeNull)(); +int RTDECL(CUFStreamDestroy)(cudaStream_t); } } // namespace Fortran::runtime::cuda diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp index 0403a43a845c..6d8c2fcf42bd 100644 --- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp @@ -403,6 +403,11 @@ static constexpr IntrinsicHandler cudaHandlers[]{ &CI::genCUDASetDefaultStream), {{{"stream", asValue}}}, /*isElemental=*/false}, + {"cudastreamdestroy", + static_cast( + &CI::genCUDAStreamDestroy), + {{{"stream", asValue}}}, + /*isElemental=*/false}, {"fence_proxy_async", static_cast( &CI::genFenceProxyAsync), @@ -1161,6 +1166,20 @@ fir::ExtendedValue CUDAIntrinsicLibrary::genCUDASetDefaultStreamArray( return call.getResult(0); } +// CUDASTREAMDESTROY +fir::ExtendedValue CUDAIntrinsicLibrary::genCUDAStreamDestroy( + mlir::Type resTy, llvm::ArrayRef args) { + assert(args.size() == 1); + mlir::Value stream = fir::getBase(args[0]); + mlir::Type i64Ty = builder.getI64Type(); + auto ctx = builder.getContext(); + mlir::FunctionType ftype = mlir::FunctionType::get(ctx, {i64Ty}, {resTy}); + auto funcOp = + builder.createFunction(loc, RTNAME_STRING(CUFStreamDestroy), ftype); + auto call = fir::CallOp::create(builder, loc, funcOp, {stream}); + return call.getResult(0); +} + // CUDASTREAMSYNCHRONIZE fir::ExtendedValue CUDAIntrinsicLibrary::genCUDAStreamSynchronize( mlir::Type resTy, llvm::ArrayRef args) { diff --git a/flang/module/cuda_runtime_api.f90 b/flang/module/cuda_runtime_api.f90 index 7c6968cabc37..1e95bcc2f81c 100644 --- a/flang/module/cuda_runtime_api.f90 +++ b/flang/module/cuda_runtime_api.f90 @@ -36,4 +36,12 @@ interface cudaforsetdefaultstream end function end interface +interface cudastreamdestroy + integer function cudastreamdestroy(stream) + import cuda_stream_kind + !DIR$ IGNORE_TKR (K) stream + integer(kind=cuda_stream_kind), value :: stream + end function +end interface + end module cuda_runtime_api diff --git a/flang/test/Lower/CUDA/cuda-default-stream.cuf b/flang/test/Lower/CUDA/cuda-default-stream.cuf index b9fe1f794959..5fc7de68b47d 100644 --- a/flang/test/Lower/CUDA/cuda-default-stream.cuf +++ b/flang/test/Lower/CUDA/cuda-default-stream.cuf @@ -39,3 +39,13 @@ end subroutine ! CHECK: %{{.*}} = fir.call @_FortranACUFGetDefaultStream() fastmath : () -> i64 ! CHECK: %{{.*}} = fir.call @_FortranACUFGetDefaultStream() fastmath : () -> i64 +subroutine stream_destroy + use cuda_runtime_api + integer(kind=cuda_stream_kind) :: strm + integer :: istat + istat = cudaStreamCreate(strm) + istat = cudaStreamDestroy(strm) +end subroutine + +! CHECK-LABEL: func.func @_QPstream_destroy() +! CHECK: %{{.*}} = fir.call @_FortranACUFStreamDestroy(%{{.*}}) fastmath : (i64) -> i32