From c4170461d7fc5ddd0423c5f7325abd0a7dfc6277 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com>
Date: Fri, 13 Feb 2026 15:44:38 -0800
Subject: [PATCH] [flang][cuda] Lower set/get default stream for arrays
 (#181432)

---
 flang-rt/lib/cuda/allocator.cpp               |  6 +--
 .../unittests/Runtime/CUDA/Allocatable.cpp    |  3 +-
 .../Optimizer/Builder/CUDAIntrinsicCall.h     |  6 +++
 flang/include/flang/Runtime/CUDA/allocator.h  |  4 +-
 .../Optimizer/Builder/CUDAIntrinsicCall.cpp   | 51 +++++++++++++++++++
 flang/module/cuda_runtime_api.f90             | 39 ++++++++++++++
 flang/test/Lower/CUDA/cuda-default-stream.cuf | 24 +++++++++
 flang/tools/f18/CMakeLists.txt                |  4 +-
 8 files changed, 127 insertions(+), 10 deletions(-)
 create mode 100644 flang/module/cuda_runtime_api.f90
 create mode 100644 flang/test/Lower/CUDA/cuda-default-stream.cuf
diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp
index 917b279b38f3..df7e43de00c7 100644
--- a/flang-rt/lib/cuda/allocator.cpp
+++ b/flang-rt/lib/cuda/allocator.cpp
@@ -141,11 +141,9 @@ cudaStream_t RTDECL(CUFGetAssociatedStream)(void *p) {
   return nullptr;
 }
 
-int RTDECL(CUFSetAssociatedStream)(void *p, cudaStream_t stream, bool hasStat,
-    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
-  Terminator terminator{sourceFile, sourceLine};
+int RTDECL(CUFSetAssociatedStream)(void *p, cudaStream_t stream) {
   if (p == nullptr) {
-    return ReturnError(terminator, StatBaseNull, errMsg, hasStat);
+    return StatBaseNull;
   }
   int pos = findAllocation(p);
   if (pos >= 0) {
diff --git a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
index e308e8c8bdad..0680c0086ea7 100644
--- a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
+++ b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
@@ -205,7 +205,6 @@ TEST(AllocatableAsyncTest, SetStreamTest) {
 
   // REAL(4), DEVICE, ALLOCATABLE :: b(:) - unallocated, base_addr is null
   auto b{createAllocatable(TypeCategory::Real, 4)};
-  int stat2 = RTDECL(CUFSetAssociatedStream)(
-      b->raw().base_addr, stream, true, nullptr, __FILE__, __LINE__);
+  int stat2 = RTDECL(CUFSetAssociatedStream)(b->raw().base_addr, stream);
   EXPECT_EQ(stat2, StatBaseNull);
 }
diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
index e9b6e5cf2393..d92f0c72dde0 100644
--- a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
@@ -51,6 +51,12 @@ struct CUDAIntrinsicLibrary : IntrinsicLibrary {
   mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genClusterBlockIndex(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genClusterDimBlocks(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  fir::ExtendedValue
+      genCUDASetDefaultStreamArray(mlir::Type,
+                                   llvm::ArrayRef<fir::ExtendedValue>);
+  fir::ExtendedValue
+      genCUDAGetDefaultStreamArg(mlir::Type,
+                                 llvm::ArrayRef<fir::ExtendedValue>);
   void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>);
   template <const char *fctName, int extent>
   fir::ExtendedValue genLDXXFunc(mlir::Type,
diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
index 6a64bdeccbc2..1493383a559c 100644
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -21,9 +21,7 @@ extern "C" {
 
 void RTDECL(CUFRegisterAllocator)();
 cudaStream_t RTDECL(CUFGetAssociatedStream)(void *);
-int RTDECL(CUFSetAssociatedStream)(void *, cudaStream_t, bool hasStat = false,
-    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+int RTDECL(CUFSetAssociatedStream)(void *, cudaStream_t);
 }
 
 void *CUFAllocPinned(std::size_t, std::int64_t *);
diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index fe2db4607f86..4c4403dcd71a 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -19,6 +19,7 @@
 #include "flang/Optimizer/Builder/MutableBox.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Runtime/entry-names.h"
 #include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -382,6 +383,16 @@ static constexpr IntrinsicHandler cudaHandlers[]{
          &CI::genClusterDimBlocks),
      {},
      /*isElemental=*/false},
+    {"cudagetstreamdefaultarg",
+     static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+         &CI::genCUDAGetDefaultStreamArg),
+     {{{"devptr", asAddr}}},
+     /*isElemental=*/false},
+    {"cudasetstreamarray",
+     static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+         &CI::genCUDASetDefaultStreamArray),
+     {{{"devptr", asAddr}, {"stream", asValue}}},
+     /*isElemental=*/false},
     {"fence_proxy_async",
      static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
          &CI::genFenceProxyAsync),
@@ -1103,6 +1114,46 @@ CUDAIntrinsicLibrary::genClusterDimBlocks(mlir::Type resultType,
   return res;
 }
 
+// CUDASETSTREAMARRAY
+fir::ExtendedValue CUDAIntrinsicLibrary::genCUDASetDefaultStreamArray(
+    mlir::Type resTy, llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 2);
+  mlir::Value arg = fir::getBase(args[0]);
+  mlir::Value stream = fir::getBase(args[1]);
+
+  if (mlir::isa<fir::BaseBoxType>(arg.getType()))
+    arg = fir::BoxAddrOp::create(builder, loc, arg);
+  mlir::Type i64Ty = builder.getI64Type();
+  mlir::Type i32Ty = builder.getI32Type();
+  auto ctx = builder.getContext();
+  mlir::Type voidPtrTy =
+      fir::LLVMPointerType::get(ctx, mlir::IntegerType::get(ctx, 8));
+  mlir::FunctionType ftype =
+      mlir::FunctionType::get(ctx, {voidPtrTy, i64Ty}, {i32Ty});
+  mlir::Value voidPtr = builder.createConvert(loc, voidPtrTy, arg);
+  auto funcOp =
+      builder.createFunction(loc, RTNAME_STRING(CUFSetAssociatedStream), ftype);
+  auto call = fir::CallOp::create(builder, loc, funcOp, {voidPtr, stream});
+  return call.getResult(0);
+}
+
+// CUDAGETDEFAULTSTREAMARG
+fir::ExtendedValue CUDAIntrinsicLibrary::genCUDAGetDefaultStreamArg(
+    mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 1);
+  mlir::Value devptr = fir::getBase(args[0]);
+  mlir::Type i64Ty = builder.getI64Type();
+  auto ctx = builder.getContext();
+  mlir::Type voidPtrTy =
+      fir::LLVMPointerType::get(ctx, mlir::IntegerType::get(ctx, 8));
+  mlir::FunctionType ftype = mlir::FunctionType::get(ctx, {voidPtrTy}, {i64Ty});
+  mlir::Value voidPtr = builder.createConvert(loc, voidPtrTy, devptr);
+  auto funcOp =
+      builder.createFunction(loc, RTNAME_STRING(CUFGetAssociatedStream), ftype);
+  auto call = fir::CallOp::create(builder, loc, funcOp, {voidPtr});
+  return call.getResult(0);
+}
+
 // FENCE_PROXY_ASYNC
 void CUDAIntrinsicLibrary::genFenceProxyAsync(
     llvm::ArrayRef<fir::ExtendedValue> args) {
diff --git a/flang/module/cuda_runtime_api.f90 b/flang/module/cuda_runtime_api.f90
new file mode 100644
index 000000000000..d6cb6d8c0f71
--- /dev/null
+++ b/flang/module/cuda_runtime_api.f90
@@ -0,0 +1,39 @@
+!===-- module/cuda_runtime_api.f90 -----------------------------------------===!
+!
+! Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+! See https://llvm.org/LICENSE.txt for license information.
+! SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+!
+!===------------------------------------------------------------------------===!
+
+module cuda_runtime_api
+implicit none
+
+integer, parameter :: cuda_stream_kind = int_ptr_kind()
+
+interface cudaforgetdefaultstream
+  integer(kind=cuda_stream_kind) function cudagetstreamdefaultarg(devptr)
+    import cuda_stream_kind
+    !DIR$ IGNORE_TKR (TKR) devptr
+    integer, device  :: devptr(*)
+  end function
+  integer(kind=cuda_stream_kind) function cudastreamgetdefaultnull()
+    import cuda_stream_kind
+  end function
+end interface
+
+interface cudaforsetdefaultstream
+  integer function cudasetdefaultstream(stream)
+    import cuda_stream_kind
+    !DIR$ IGNORE_TKR (K) stream
+    integer(kind=cuda_stream_kind), value :: stream
+  end function
+  integer function cudasetstreamarray(devptr, stream)
+    import cuda_stream_kind
+    !DIR$ IGNORE_TKR (K) stream, (TKR) devptr
+    integer, device  :: devptr(*)
+    integer(kind=cuda_stream_kind), value :: stream
+  end function
+end interface
+
+end module cuda_runtime_api
diff --git a/flang/test/Lower/CUDA/cuda-default-stream.cuf b/flang/test/Lower/CUDA/cuda-default-stream.cuf
new file mode 100644
index 000000000000..59c6bc6b7061
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-default-stream.cuf
@@ -0,0 +1,24 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+subroutine associated_stream
+  use cuda_runtime_api
+  integer(kind=cuda_stream_kind) :: strm, strmout
+  integer, managed, allocatable :: v(:)
+  integer :: istat
+
+  istat = cudaforSetDefaultStream(v, strm)
+  strmout = cudaforGetDefaultStream(v)
+  
+end subroutine
+
+! CHECK-LABEL: func.func @_QPassociated_stream()
+! CHECK: %[[ADDR:.*]] = fir.box_addr %{{.*}} : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! CHECK: %[[STREAM:.*]] = fir.load %{{.*}}#0 : !fir.ref<i64>
+! CHECK: %[[VOIDPTR:.*]] = fir.convert %[[ADDR]] : (!fir.heap<!fir.array<?xi32>>) -> !fir.llvm_ptr<i8>
+! CHECK: %[[STAT:.*]] = fir.call @_FortranACUFSetAssociatedStream(%[[VOIDPTR]], %[[STREAM]]) fastmath<contract> : (!fir.llvm_ptr<i8>, i64) -> i32
+! CHECK: hlfir.assign %[[STAT]] to %{{.*}}#0 : i32, !fir.ref<i32>
+
+! CHECK: %[[ADDR:.*]] = fir.box_addr %{{.*}} : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! CHECK: %[[VOIDPTR:.*]] = fir.convert %[[ADDR]] : (!fir.heap<!fir.array<?xi32>>) -> !fir.llvm_ptr<i8>
+! CHECK: %[[STREAM:.*]] = fir.call @_FortranACUFGetAssociatedStream(%[[VOIDPTR]]) fastmath<contract> : (!fir.llvm_ptr<i8>) -> i64
+! CHECK: hlfir.assign %[[STREAM]] to %{{.*}}#0 : i64, !fir.ref<i64>
diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt
index ffd92f033840..74b329f6c6c0 100644
--- a/flang/tools/f18/CMakeLists.txt
+++ b/flang/tools/f18/CMakeLists.txt
@@ -16,6 +16,7 @@ set(MODULES
   "__cuda_builtins"
   "__cuda_device"
   "cooperative_groups"
+  "cuda_runtime_api"
   "cudadevice"
   "ieee_arithmetic"
   "ieee_exceptions"
@@ -64,7 +65,8 @@ if (NOT CMAKE_CROSSCOMPILING)
       set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__ppc_types.mod)
     elseif(${filename} STREQUAL "__cuda_device" OR
            ${filename} STREQUAL "cudadevice" OR
-           ${filename} STREQUAL "cooperative_groups")
+           ${filename} STREQUAL "cooperative_groups" OR
+           ${filename} STREQUAL "cuda_runtime_api")
       set(opts -fc1 -xcuda)
       if(${filename} STREQUAL "__cuda_device")
         set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__cuda_builtins.mod)