From 2c6771889a7bc96c0f7010214c5e1e87d86a7bf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Mon, 14 Jul 2025 17:23:18 -0700 Subject: [PATCH] [flang][cuda] Introduce cuf.set_allocator_idx operation (#148717) --- flang-rt/lib/cuda/descriptor.cpp | 9 +++++ .../unittests/Runtime/CUDA/AllocatorCUF.cpp | 10 ++++++ .../Builder/Runtime/CUDA/Descriptor.h | 4 +++ .../flang/Optimizer/Dialect/CUF/CUFOps.td | 21 ++++++++++++ flang/include/flang/Runtime/CUDA/descriptor.h | 4 +++ .../Builder/Runtime/CUDA/Descriptor.cpp | 15 +++++++++ flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp | 11 +++++++ .../Optimizer/Transforms/CUFOpConversion.cpp | 33 +++++++++++++++++-- flang/test/Fir/CUDA/cuda-alloc-free.fir | 15 +++++++++ 9 files changed, 120 insertions(+), 2 deletions(-) diff --git a/flang-rt/lib/cuda/descriptor.cpp b/flang-rt/lib/cuda/descriptor.cpp index aa75d4eff051..f81316cc0173 100644 --- a/flang-rt/lib/cuda/descriptor.cpp +++ b/flang-rt/lib/cuda/descriptor.cpp @@ -62,6 +62,15 @@ void RTDEF(CUFDescriptorCheckSection)( } } +void RTDEF(CUFSetAllocatorIndex)( + Descriptor *desc, int index, const char *sourceFile, int sourceLine) { + if (!desc) { // Report a null descriptor at the given source location. + Terminator terminator{sourceFile, sourceLine}; + terminator.Crash("descriptor is null"); + } + desc->SetAllocIdx(index); +} + RT_EXT_API_GROUP_END } } // namespace Fortran::runtime::cuda diff --git a/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp index f1f931e87a86..83aa37f8d06f 100644 --- a/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp +++ b/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp @@ -72,3 +72,13 @@ TEST(AllocatableCUFTest, DescriptorAllocationTest) { EXPECT_TRUE(desc != nullptr); RTNAME(CUFFreeDescriptor)(desc); } + +TEST(AllocatableCUFTest, CUFSetAllocatorIndex) { + using Fortran::common::TypeCategory; +
RTNAME(CUFRegisterAllocator)(); + // REAL(4), DEVICE, ALLOCATABLE :: a(:) + auto a{createAllocatable(TypeCategory::Real, 4)}; + EXPECT_EQ((int)kDefaultAllocator, a->GetAllocIdx()); + RTNAME(CUFSetAllocatorIndex)(a.get(), kDeviceAllocatorPos, __FILE__, __LINE__); + EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx()); +} diff --git a/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h index bdeb7574012c..43dca65322a6 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h @@ -31,6 +31,10 @@ void genSyncGlobalDescriptor(fir::FirOpBuilder &builder, mlir::Location loc, void genDescriptorCheckSection(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value desc); +/// Generate runtime call to set the allocator index in the descriptor. +void genSetAllocatorIndex(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value desc, mlir::Value index); + } // namespace fir::runtime::cuda #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_ diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td index e38738230ffb..23ab88261520 100644 --- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td +++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td @@ -388,4 +388,25 @@ def cuf_StreamCastOp : cuf_Op<"stream_cast", [NoMemoryEffect]> { let hasVerifier = 1; } +def cuf_SetAllocatorIndexOp : cuf_Op<"set_allocator_idx", []> { + let summary = "Set the allocator index in a descriptor"; + + let description = [{ + Allocator index in the Fortran descriptor is used to retrieve the correct + CUDA allocator to allocate the memory on the device. + In many cases the allocator index is set when the descriptor is created.
For + device components, the descriptor is part of the derived-type itself and + needs to be set after the derived-type is allocated in managed memory. + }]; + + let arguments = (ins Arg:$box, + cuf_DataAttributeAttr:$data_attr); + + let assemblyFormat = [{ + $box `:` qualified(type($box)) attr-dict + }]; + + let hasVerifier = 1; +} + #endif // FORTRAN_DIALECT_CUF_CUF_OPS diff --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h index 06e4a4649db1..7555f276ac1d 100644 --- a/flang/include/flang/Runtime/CUDA/descriptor.h +++ b/flang/include/flang/Runtime/CUDA/descriptor.h @@ -41,6 +41,10 @@ void RTDECL(CUFSyncGlobalDescriptor)( void RTDECL(CUFDescriptorCheckSection)( const Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0); +/// Set the descriptor's allocator index to `index`. Crashes on a null pointer. +void RTDECL(CUFSetAllocatorIndex)(Descriptor *, int index, + const char *sourceFile = nullptr, int sourceLine = 0); + } // extern "C" } // namespace Fortran::runtime::cuda diff --git a/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp index a943469a7672..62a0652cc2e5 100644 --- a/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp @@ -47,3 +47,18 @@ void fir::runtime::cuda::genDescriptorCheckSection(fir::FirOpBuilder &builder, builder, loc, fTy, desc, sourceFile, sourceLine)}; builder.create(loc, func, args); } + +void fir::runtime::cuda::genSetAllocatorIndex(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value desc, + mlir::Value index) { + mlir::func::FuncOp func = + fir::runtime::getRuntimeFunc(loc, builder); + auto fTy = func.getFunctionType(); + mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); + mlir::Value sourceLine = + fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); + llvm::SmallVector args{fir::runtime::createArguments( + builder, loc, fTy,
desc, index, sourceFile, sourceLine)}; + builder.create(loc, func, args); +} diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp index 687007d95722..ade80716f256 100644 --- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp +++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp @@ -345,6 +345,17 @@ llvm::LogicalResult cuf::StreamCastOp::verify() { return checkStreamType(*this); } +//===----------------------------------------------------------------------===// +// SetAllocatorIndexOp +//===----------------------------------------------------------------------===// + +llvm::LogicalResult cuf::SetAllocatorIndexOp::verify() { + if (!mlir::isa(fir::unwrapRefType(getBox().getType()))) + return emitOpError( + "expect box to be a reference to class or box type value"); + return mlir::success(); +} + // Tablegen operators #define GET_OP_CLASSES diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 0fff06033b73..750569c12664 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -22,6 +22,7 @@ #include "flang/Runtime/CUDA/memory.h" #include "flang/Runtime/CUDA/pointer.h" #include "flang/Runtime/allocatable.h" +#include "flang/Runtime/allocator-registry-consts.h" #include "flang/Support/Fortran.h" #include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Dialect/DLTI/DLTI.h" @@ -923,6 +924,34 @@ struct CUFSyncDescriptorOpConversion } }; +struct CUFSetAllocatorIndexOpConversion + : public mlir::OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(cuf::SetAllocatorIndexOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->getParentOfType(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + int idx = kDefaultAllocator; + if (op.getDataAttr() == cuf::DataAttribute::Device) { + idx = kDeviceAllocatorPos;
+ } else if (op.getDataAttr() == cuf::DataAttribute::Managed) { + idx = kManagedAllocatorPos; + } else if (op.getDataAttr() == cuf::DataAttribute::Unified) { + idx = kUnifiedAllocatorPos; + } else if (op.getDataAttr() == cuf::DataAttribute::Pinned) { + idx = kPinnedAllocatorPos; + } + mlir::Value index = + builder.createIntegerConstant(loc, builder.getI32Type(), idx); + fir::runtime::cuda::genSetAllocatorIndex(builder, loc, op.getBox(), index); + rewriter.eraseOp(op); // Keep the pattern driver informed. + return mlir::success(); + } +}; + class CUFOpConversion : public fir::impl::CUFOpConversionBase { public: void runOnOperation() override { @@ -984,8 +1013,8 @@ void cuf::populateCUFToFIRConversionPatterns( const mlir::SymbolTable &symtab, mlir::RewritePatternSet &patterns) { patterns.insert(patterns.getContext(), &dl, &converter); patterns.insert( - patterns.getContext()); + CUFFreeOpConversion, CUFSyncDescriptorOpConversion, + CUFSetAllocatorIndexOpConversion>(patterns.getContext()); patterns.insert(patterns.getContext(), symtab, &dl, &converter); patterns.insert( diff --git a/flang/test/Fir/CUDA/cuda-alloc-free.fir b/flang/test/Fir/CUDA/cuda-alloc-free.fir index 31f2ed022b6c..8b6e7d67931d 100644 --- a/flang/test/Fir/CUDA/cuda-alloc-free.fir +++ b/flang/test/Fir/CUDA/cuda-alloc-free.fir @@ -94,4 +94,19 @@ func.func @_QQalloc_char() attributes {fir.bindc_name = "alloc_char"} { // CHECK: %[[BYTES_CONV:.*]] = fir.convert %[[BYTES]] : (index) -> i64 // CHECK: fir.call @_FortranACUFMemAlloc(%[[BYTES_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) {cuf.data_attr = #cuf.cuda} : (i64, i32, !fir.ref, i32) -> !fir.llvm_ptr + +func.func @_QQsetalloc() { + %0 = cuf.alloc !fir.type<_QMm1Tdt1{a2:!fir.box>>}> {bindc_name = "d1", data_attr = #cuf.cuda, uniq_name = "_QFEd1"} -> !fir.ref>>}>> + %1 = fir.coordinate_of %0, a2 : (!fir.ref>>}>>) -> !fir.ref>>> + cuf.set_allocator_idx %1 : !fir.ref>>> {data_attr = #cuf.cuda} + return +} + +// CHECK-LABEL: func.func @_QQsetalloc() { +// CHECK: %[[DT:.*]] = fir.call @_FortranACUFMemAlloc +//
CHECK: %[[CONV:.*]] = fir.convert %[[DT]] : (!fir.llvm_ptr) -> !fir.ref>>}>> +// CHECK: %[[COMP:.*]] = fir.coordinate_of %[[CONV]], a2 : (!fir.ref>>}>>) -> !fir.ref>>> +// CHECK: %[[DESC:.*]] = fir.convert %[[COMP]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: fir.call @_FortranACUFSetAllocatorIndex(%[[DESC]], %c2{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, i32, !fir.ref, i32) -> () + } // end module