From a2899c457ecac9f2511fa08926bcf1c22eee1d14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 13 Aug 2025 10:55:15 -0700 Subject: [PATCH] [flang][cuda] Support data transfer with conversion (#153242) When the rhs of the data transfer is from a different type, allocate a new temp on the host and first transfer the rhs to it. Then, use the elemental op created to do the conversion. --- flang/include/flang/Lower/CUDA.h | 2 + flang/lib/Lower/Bridge.cpp | 30 +++++++++++- flang/lib/Lower/CUDA.cpp | 9 ++++ flang/test/Lower/CUDA/cuda-data-transfer.cuf | 48 ++++++++++++++++++++ 4 files changed, 87 insertions(+), 2 deletions(-) diff --git a/flang/include/flang/Lower/CUDA.h b/flang/include/flang/Lower/CUDA.h index 6c2e6d71a123..4a831fd502af 100644 --- a/flang/include/flang/Lower/CUDA.h +++ b/flang/include/flang/Lower/CUDA.h @@ -62,6 +62,8 @@ cuf::DataAttributeAttr translateSymbolCUFDataAttribute(mlir::MLIRContext *mlirContext, const Fortran::semantics::Symbol &sym); +bool isTransferWithConversion(mlir::Value rhs); + } // end namespace Fortran::lower #endif // FORTRAN_LOWER_CUDA_H diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index d16488d44454..b636416ea8c1 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -4827,7 +4827,9 @@ private: void genCUDADataTransfer(fir::FirOpBuilder &builder, mlir::Location loc, const Fortran::evaluate::Assignment &assign, - hlfir::Entity &lhs, hlfir::Entity &rhs) { + hlfir::Entity &lhs, hlfir::Entity &rhs, + bool isWholeAllocatableAssignment, + bool keepLhsLengthInAllocatableAssignment) { bool lhsIsDevice = Fortran::evaluate::HasCUDADeviceAttrs(assign.lhs); bool rhsIsDevice = Fortran::evaluate::HasCUDADeviceAttrs(assign.rhs); @@ -4892,6 +4894,28 @@ private: // host = device if (!lhsIsDevice && rhsIsDevice) { + if 
(Fortran::lower::isTransferWithConversion(rhs)) { + mlir::OpBuilder::InsertionGuard insertionGuard(builder); + auto elementalOp = + mlir::dyn_cast(rhs.getDefiningOp()); + assert(elementalOp && "expect elemental op"); + auto designateOp = + *elementalOp.getBody()->getOps().begin(); + builder.setInsertionPoint(elementalOp); + // Create a temp to transfer the rhs before applying the conversion. + hlfir::Entity entity{designateOp.getMemref()}; + auto [temp, cleanup] = hlfir::createTempFromMold(loc, builder, entity); + auto transferKindAttr = cuf::DataTransferKindAttr::get( + builder.getContext(), cuf::DataTransferKind::DeviceHost); + cuf::DataTransferOp::create(builder, loc, designateOp.getMemref(), temp, + /*shape=*/mlir::Value{}, transferKindAttr); + designateOp.getMemrefMutable().assign(temp); + builder.setInsertionPointAfter(elementalOp); + hlfir::AssignOp::create(builder, loc, elementalOp, lhs, + isWholeAllocatableAssignment, + keepLhsLengthInAllocatableAssignment); + return; + } auto transferKindAttr = cuf::DataTransferKindAttr::get( builder.getContext(), cuf::DataTransferKind::DeviceHost); cuf::DataTransferOp::create(builder, loc, rhsVal, lhsVal, shape, @@ -5039,7 +5063,9 @@ private: hlfir::Entity rhs = evaluateRhs(localStmtCtx); hlfir::Entity lhs = evaluateLhs(localStmtCtx); if (isCUDATransfer && !hasCUDAImplicitTransfer) - genCUDADataTransfer(builder, loc, assign, lhs, rhs); + genCUDADataTransfer(builder, loc, assign, lhs, rhs, + isWholeAllocatableAssignment, + keepLhsLengthInAllocatableAssignment); else hlfir::AssignOp::create(builder, loc, rhs, lhs, isWholeAllocatableAssignment, diff --git a/flang/lib/Lower/CUDA.cpp b/flang/lib/Lower/CUDA.cpp index f6d00780a16f..3e2804dace99 100644 --- a/flang/lib/Lower/CUDA.cpp +++ b/flang/lib/Lower/CUDA.cpp @@ -155,3 +155,12 @@ cuf::DataAttributeAttr Fortran::lower::translateSymbolCUFDataAttribute( Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate()); return cuf::getDataAttribute(mlirContext, cudaAttr); } + +bool 
Fortran::lower::isTransferWithConversion(mlir::Value rhs) { + if (auto elOp = mlir::dyn_cast(rhs.getDefiningOp())) + if (llvm::hasSingleElement(elOp.getBody()->getOps()) && + llvm::hasSingleElement(elOp.getBody()->getOps()) == 1 && + llvm::hasSingleElement(elOp.getBody()->getOps()) == 1) + return true; + return false; +} diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf index 8f8bd9b67737..aef926b09a1e 100644 --- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf +++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf @@ -474,3 +474,51 @@ end ! CHECK: cuf.data_transfer %{{.*}} to %{{.*}} {transfer_kind = #cuf.cuda_transfer} : !fir.ref>>>, !fir.box> ! CHECK: hlfir.assign %{{.*}} to %{{.*}} : f64, !fir.ref ! CHECK: fir.freemem %{{.*}} : !fir.heap> + +subroutine sub26(i, j, k) + integer :: i, j, k + real(2), dimension(i,j,k), device :: d + real(4), dimension(i,j,k) :: hd + + hd = d +end subroutine + +! CHECK-LABEL: func.func @_QPsub26 +! CHECK: %[[ALLOC_D:.*]] = cuf.alloc !fir.array, %{{.*}}, %{{.*}}, %{{.*}} : index, index, index {bindc_name = "d", data_attr = #cuf.cuda, uniq_name = "_QFsub26Ed"} -> !fir.ref> +! CHECK: %[[D:.*]]:2 = hlfir.declare %[[ALLOC_D]](%{{.*}}) {data_attr = #cuf.cuda, uniq_name = "_QFsub26Ed"} : (!fir.ref>, !fir.shape<3>) -> (!fir.box>, !fir.ref>) +! CHECK: %[[HD:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub26Ehd"} : (!fir.ref>, !fir.shape<3>) -> (!fir.box>, !fir.ref>) +! CHECK: %[[ALLOC:.*]] = fir.allocmem !fir.array, %8, %13, %18 {bindc_name = ".tmp", uniq_name = ""} +! CHECK: %[[TEMP:.*]]:2 = hlfir.declare %[[ALLOC]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap>, !fir.shape<3>) -> (!fir.box>, !fir.heap>) +! CHECK: cuf.data_transfer %[[D]]#0 to %[[TEMP]]#0 {transfer_kind = #cuf.cuda_transfer} : !fir.box>, !fir.box> +! CHECK: %[[ELE:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<3>) -> !hlfir.expr { +! CHECK: ^bb0(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index): +! 
CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[TEMP]]#0 (%{{.*}}, %{{.*}}, %{{.*}}) : (!fir.box>, index, index, index) -> !fir.ref +! CHECK: %[[LOAD:.*]] = fir.load %[[DESIGNATE]] : !fir.ref +! CHECK: %[[CONV:.*]] = fir.convert %[[LOAD]] : (f16) -> f32 +! CHECK: hlfir.yield_element %[[CONV]] : f32 +! CHECK: } +! CHECK: hlfir.assign %[[ELE]] to %[[HD]]#0 : !hlfir.expr, !fir.box> + +subroutine sub27() + real(2), dimension(10, 20, 30), device :: d + real(4), dimension(10, 20, 30) :: hd + + hd = d +end subroutine + +! CHECK-LABEL: func.func @_QPsub27() +! CHECK: %[[ALLOC_D:.*]] = cuf.alloc !fir.array<10x20x30xf16> {bindc_name = "d", data_attr = #cuf.cuda, uniq_name = "_QFsub27Ed"} -> !fir.ref> +! CHECK: %[[D:.*]]:2 = hlfir.declare %[[ALLOC_D]](%{{.*}}) {data_attr = #cuf.cuda, uniq_name = "_QFsub27Ed"} : (!fir.ref>, !fir.shape<3>) -> (!fir.ref>, !fir.ref>) +! CHECK: %[[ALLOC_HD:.*]] = fir.alloca !fir.array<10x20x30xf32> {bindc_name = "hd", uniq_name = "_QFsub27Ehd"} +! CHECK: %[[HD:.*]]:2 = hlfir.declare %[[ALLOC_HD]](%{{.*}}) {uniq_name = "_QFsub27Ehd"} : (!fir.ref>, !fir.shape<3>) -> (!fir.ref>, !fir.ref>) +! CHECK: %[[ALLOC_TEMP:.*]] = fir.allocmem !fir.array<10x20x30xf16> {bindc_name = ".tmp", uniq_name = ""} +! CHECK: %[[TEMP:.*]]:2 = hlfir.declare %[[ALLOC_TEMP]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap>, !fir.shape<3>) -> (!fir.heap>, !fir.heap>) +! CHECK: cuf.data_transfer %[[D]]#0 to %[[TEMP]]#0 {transfer_kind = #cuf.cuda_transfer} : !fir.ref>, !fir.heap> +! CHECK: %[[ELE:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<3>) -> !hlfir.expr<10x20x30xf32> { +! CHECK: ^bb0(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index): +! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[TEMP]]#0 (%{{.*}}, %{{.*}}, %{{.*}}) : (!fir.heap>, index, index, index) -> !fir.ref +! CHECK: %[[LOAD:.*]] = fir.load %[[DESIGNATE]] : !fir.ref +! CHECK: %[[CONV:.*]] = fir.convert %[[LOAD]] : (f16) -> f32 +! CHECK: hlfir.yield_element %[[CONV]] : f32 +! CHECK: } +! 
CHECK: hlfir.assign %[[ELE]] to %[[HD]]#0 : !hlfir.expr<10x20x30xf32>, !fir.ref<!fir.array<10x20x30xf32>>