From c4170461d7fc5ddd0423c5f7325abd0a7dfc6277 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 13 Feb 2026 15:44:38 -0800 Subject: [PATCH] [flang][cuda] Lower set/get default stream for arrays (#181432) --- flang-rt/lib/cuda/allocator.cpp | 6 +-- .../unittests/Runtime/CUDA/Allocatable.cpp | 3 +- .../Optimizer/Builder/CUDAIntrinsicCall.h | 6 +++ flang/include/flang/Runtime/CUDA/allocator.h | 4 +- .../Optimizer/Builder/CUDAIntrinsicCall.cpp | 51 +++++++++++++++++++ flang/module/cuda_runtime_api.f90 | 39 ++++++++++++++ flang/test/Lower/CUDA/cuda-default-stream.cuf | 24 +++++++++ flang/tools/f18/CMakeLists.txt | 4 +- 8 files changed, 127 insertions(+), 10 deletions(-) create mode 100644 flang/module/cuda_runtime_api.f90 create mode 100644 flang/test/Lower/CUDA/cuda-default-stream.cuf diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp index 917b279b38f3..df7e43de00c7 100644 --- a/flang-rt/lib/cuda/allocator.cpp +++ b/flang-rt/lib/cuda/allocator.cpp @@ -141,11 +141,9 @@ cudaStream_t RTDECL(CUFGetAssociatedStream)(void *p) { return nullptr; } -int RTDECL(CUFSetAssociatedStream)(void *p, cudaStream_t stream, bool hasStat, - const Descriptor *errMsg, const char *sourceFile, int sourceLine) { - Terminator terminator{sourceFile, sourceLine}; +int RTDECL(CUFSetAssociatedStream)(void *p, cudaStream_t stream) { if (p == nullptr) { - return ReturnError(terminator, StatBaseNull, errMsg, hasStat); + return StatBaseNull; } int pos = findAllocation(p); if (pos >= 0) { diff --git a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp index e308e8c8bdad..0680c0086ea7 100644 --- a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp +++ b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp @@ -205,7 +205,6 @@ TEST(AllocatableAsyncTest, SetStreamTest) { // REAL(4), DEVICE, ALLOCATABLE :: b(:) - unallocated, base_addr is null auto b{createAllocatable(TypeCategory::Real, 4)}; - int stat2 = RTDECL(CUFSetAssociatedStream)( - b->raw().base_addr, stream, true, nullptr, __FILE__, __LINE__); + int stat2 = RTDECL(CUFSetAssociatedStream)(b->raw().base_addr, stream); EXPECT_EQ(stat2, StatBaseNull); } diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h index e9b6e5cf2393..d92f0c72dde0 100644 --- a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h @@ -51,6 +51,12 @@ struct CUDAIntrinsicLibrary : IntrinsicLibrary { mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef); mlir::Value genClusterBlockIndex(mlir::Type, llvm::ArrayRef); mlir::Value genClusterDimBlocks(mlir::Type, llvm::ArrayRef); + fir::ExtendedValue + genCUDASetDefaultStreamArray(mlir::Type, + llvm::ArrayRef); + fir::ExtendedValue + genCUDAGetDefaultStreamArg(mlir::Type, + llvm::ArrayRef); void genFenceProxyAsync(llvm::ArrayRef); template fir::ExtendedValue genLDXXFunc(mlir::Type, diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h index 6a64bdeccbc2..1493383a559c 100644 --- a/flang/include/flang/Runtime/CUDA/allocator.h +++ b/flang/include/flang/Runtime/CUDA/allocator.h @@ -21,9 +21,7 @@ extern "C" { void RTDECL(CUFRegisterAllocator)(); cudaStream_t RTDECL(CUFGetAssociatedStream)(void *); -int RTDECL(CUFSetAssociatedStream)(void *, cudaStream_t, bool hasStat = false, - const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, - int sourceLine = 0); +int RTDECL(CUFSetAssociatedStream)(void *, cudaStream_t); } void *CUFAllocPinned(std::size_t, std::int64_t *); diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp index fe2db4607f86..4c4403dcd71a 100644 --- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp @@ -19,6 +19,7 @@ #include "flang/Optimizer/Builder/MutableBox.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" +#include "flang/Runtime/entry-names.h" #include "mlir/Dialect/Index/IR/IndexOps.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" @@ -382,6 +383,16 @@ static constexpr IntrinsicHandler cudaHandlers[]{ &CI::genClusterDimBlocks), {}, /*isElemental=*/false}, + {"cudagetstreamdefaultarg", + static_cast( + &CI::genCUDAGetDefaultStreamArg), + {{{"devptr", asAddr}}}, + /*isElemental=*/false}, + {"cudasetstreamarray", + static_cast( + &CI::genCUDASetDefaultStreamArray), + {{{"devptr", asAddr}, {"stream", asValue}}}, + /*isElemental=*/false}, {"fence_proxy_async", static_cast( &CI::genFenceProxyAsync), @@ -1103,6 +1114,46 @@ CUDAIntrinsicLibrary::genClusterDimBlocks(mlir::Type resultType, return res; } +// CUDASETSTREAMARRAY +fir::ExtendedValue CUDAIntrinsicLibrary::genCUDASetDefaultStreamArray( + mlir::Type resTy, llvm::ArrayRef args) { + assert(args.size() == 2); + mlir::Value arg = fir::getBase(args[0]); + mlir::Value stream = fir::getBase(args[1]); + + if (mlir::isa(arg.getType())) + arg = fir::BoxAddrOp::create(builder, loc, arg); + mlir::Type i64Ty = builder.getI64Type(); + mlir::Type i32Ty = builder.getI32Type(); + auto ctx = builder.getContext(); + mlir::Type voidPtrTy = + fir::LLVMPointerType::get(ctx, mlir::IntegerType::get(ctx, 8)); + mlir::FunctionType ftype = + mlir::FunctionType::get(ctx, {voidPtrTy, i64Ty}, {i32Ty}); + mlir::Value voidPtr = builder.createConvert(loc, voidPtrTy, arg); + auto funcOp = + builder.createFunction(loc, RTNAME_STRING(CUFSetAssociatedStream), ftype); + auto call = fir::CallOp::create(builder, loc, funcOp, {voidPtr, stream}); + return call.getResult(0); +} + +// CUDAGETDEFAULTSTREAMARG +fir::ExtendedValue CUDAIntrinsicLibrary::genCUDAGetDefaultStreamArg( + mlir::Type resultType, llvm::ArrayRef args) { + assert(args.size() == 1); + mlir::Value devptr = fir::getBase(args[0]); + mlir::Type i64Ty = builder.getI64Type(); + auto ctx = builder.getContext(); + mlir::Type voidPtrTy = + fir::LLVMPointerType::get(ctx, mlir::IntegerType::get(ctx, 8)); + mlir::FunctionType ftype = mlir::FunctionType::get(ctx, {voidPtrTy}, {i64Ty}); + mlir::Value voidPtr = builder.createConvert(loc, voidPtrTy, devptr); + auto funcOp = + builder.createFunction(loc, RTNAME_STRING(CUFGetAssociatedStream), ftype); + auto call = fir::CallOp::create(builder, loc, funcOp, {voidPtr}); + return call.getResult(0); +} + // FENCE_PROXY_ASYNC void CUDAIntrinsicLibrary::genFenceProxyAsync( llvm::ArrayRef args) { diff --git a/flang/module/cuda_runtime_api.f90 b/flang/module/cuda_runtime_api.f90 new file mode 100644 index 000000000000..d6cb6d8c0f71 --- /dev/null +++ b/flang/module/cuda_runtime_api.f90 @@ -0,0 +1,39 @@ +!===-- module/cuda_runtime_api.f90 -----------------------------------------===! +! +! Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +! See https://llvm.org/LICENSE.txt for license information. +! SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +! +!===------------------------------------------------------------------------===! + +module cuda_runtime_api +implicit none + +integer, parameter :: cuda_stream_kind = int_ptr_kind() + +interface cudaforgetdefaultstream + integer(kind=cuda_stream_kind) function cudagetstreamdefaultarg(devptr) + import cuda_stream_kind + !DIR$ IGNORE_TKR (TKR) devptr + integer, device :: devptr(*) + end function + integer(kind=cuda_stream_kind) function cudastreamgetdefaultnull() + import cuda_stream_kind + end function +end interface + +interface cudaforsetdefaultstream + integer function cudasetdefaultstream(stream) + import cuda_stream_kind + !DIR$ IGNORE_TKR (K) stream + integer(kind=cuda_stream_kind), value :: stream + end function + integer function cudasetstreamarray(devptr, stream) + import cuda_stream_kind + !DIR$ IGNORE_TKR (K) stream, (TKR) devptr + integer, device :: devptr(*) + integer(kind=cuda_stream_kind), value :: stream + end function +end interface + +end module cuda_runtime_api diff --git a/flang/test/Lower/CUDA/cuda-default-stream.cuf b/flang/test/Lower/CUDA/cuda-default-stream.cuf new file mode 100644 index 000000000000..59c6bc6b7061 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-default-stream.cuf @@ -0,0 +1,24 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +subroutine associated_stream + use cuda_runtime_api + integer(kind=cuda_stream_kind) :: strm, strmout + integer, managed, allocatable :: v(:) + integer :: istat + + istat = cudaforSetDefaultStream(v, strm) + strmout = cudaforGetDefaultStream(v) + +end subroutine + +! CHECK-LABEL: func.func @_QPassociated_stream() +! CHECK: %[[ADDR:.*]] = fir.box_addr %{{.*}} : (!fir.box>>) -> !fir.heap> +! CHECK: %[[STREAM:.*]] = fir.load %{{.*}}#0 : !fir.ref +! CHECK: %[[VOIDPTR:.*]] = fir.convert %[[ADDR]] : (!fir.heap>) -> !fir.llvm_ptr +! CHECK: %[[STAT:.*]] = fir.call @_FortranACUFSetAssociatedStream(%[[VOIDPTR]], %[[STREAM]]) fastmath : (!fir.llvm_ptr, i64) -> i32 +! CHECK: hlfir.assign %[[STAT]] to %{{.*}}#0 : i32, !fir.ref + +! CHECK: %[[ADDR:.*]] = fir.box_addr %{{.*}} : (!fir.box>>) -> !fir.heap> +! CHECK: %[[VOIDPTR:.*]] = fir.convert %[[ADDR]] : (!fir.heap>) -> !fir.llvm_ptr +! CHECK: %[[STREAM:.*]] = fir.call @_FortranACUFGetAssociatedStream(%[[VOIDPTR]]) fastmath : (!fir.llvm_ptr) -> i64 +! CHECK: hlfir.assign %[[STREAM]] to %{{.*}}#0 : i64, !fir.ref diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt index ffd92f033840..74b329f6c6c0 100644 --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -16,6 +16,7 @@ set(MODULES "__cuda_builtins" "__cuda_device" "cooperative_groups" + "cuda_runtime_api" "cudadevice" "ieee_arithmetic" "ieee_exceptions" @@ -64,7 +65,8 @@ if (NOT CMAKE_CROSSCOMPILING) set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__ppc_types.mod) elseif(${filename} STREQUAL "__cuda_device" OR ${filename} STREQUAL "cudadevice" OR - ${filename} STREQUAL "cooperative_groups") + ${filename} STREQUAL "cooperative_groups" OR + ${filename} STREQUAL "cuda_runtime_api") set(opts -fc1 -xcuda) if(${filename} STREQUAL "__cuda_device") set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__cuda_builtins.mod)