[flang][cuda] Support data transfer with conversion (#153242)

When the rhs of the data transfer has a different type than the lhs,
allocate a new temp on the host and first transfer the rhs to it. Then use
the elemental op that was created for the rhs to do the conversion.
This commit is contained in:
Valentin Clement (バレンタイン クレメン) 2025-08-13 10:55:15 -07:00 committed by GitHub
parent 6ae6c4fa4b
commit a2899c457e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 87 additions and 2 deletions

View File

@ -62,6 +62,8 @@ cuf::DataAttributeAttr
translateSymbolCUFDataAttribute(mlir::MLIRContext *mlirContext,
const Fortran::semantics::Symbol &sym);
/// Return true if \p rhs is defined by an hlfir.elemental operation whose body
/// contains exactly one hlfir.designate, one fir.load and one fir.convert,
/// i.e. an element-wise conversion of a single designated entity.
bool isTransferWithConversion(mlir::Value rhs);
} // end namespace Fortran::lower
#endif // FORTRAN_LOWER_CUDA_H

View File

@ -4827,7 +4827,9 @@ private:
void genCUDADataTransfer(fir::FirOpBuilder &builder, mlir::Location loc,
const Fortran::evaluate::Assignment &assign,
hlfir::Entity &lhs, hlfir::Entity &rhs) {
hlfir::Entity &lhs, hlfir::Entity &rhs,
bool isWholeAllocatableAssignment,
bool keepLhsLengthInAllocatableAssignment) {
bool lhsIsDevice = Fortran::evaluate::HasCUDADeviceAttrs(assign.lhs);
bool rhsIsDevice = Fortran::evaluate::HasCUDADeviceAttrs(assign.rhs);
@ -4892,6 +4894,28 @@ private:
// host = device
if (!lhsIsDevice && rhsIsDevice) {
if (Fortran::lower::isTransferWithConversion(rhs)) {
mlir::OpBuilder::InsertionGuard insertionGuard(builder);
auto elementalOp =
mlir::dyn_cast<hlfir::ElementalOp>(rhs.getDefiningOp());
assert(elementalOp && "expect elemental op");
auto designateOp =
*elementalOp.getBody()->getOps<hlfir::DesignateOp>().begin();
builder.setInsertionPoint(elementalOp);
// Create a temp to transfer the rhs before applying the conversion.
hlfir::Entity entity{designateOp.getMemref()};
auto [temp, cleanup] = hlfir::createTempFromMold(loc, builder, entity);
auto transferKindAttr = cuf::DataTransferKindAttr::get(
builder.getContext(), cuf::DataTransferKind::DeviceHost);
cuf::DataTransferOp::create(builder, loc, designateOp.getMemref(), temp,
/*shape=*/mlir::Value{}, transferKindAttr);
designateOp.getMemrefMutable().assign(temp);
builder.setInsertionPointAfter(elementalOp);
hlfir::AssignOp::create(builder, loc, elementalOp, lhs,
isWholeAllocatableAssignment,
keepLhsLengthInAllocatableAssignment);
return;
}
auto transferKindAttr = cuf::DataTransferKindAttr::get(
builder.getContext(), cuf::DataTransferKind::DeviceHost);
cuf::DataTransferOp::create(builder, loc, rhsVal, lhsVal, shape,
@ -5039,7 +5063,9 @@ private:
hlfir::Entity rhs = evaluateRhs(localStmtCtx);
hlfir::Entity lhs = evaluateLhs(localStmtCtx);
if (isCUDATransfer && !hasCUDAImplicitTransfer)
genCUDADataTransfer(builder, loc, assign, lhs, rhs);
genCUDADataTransfer(builder, loc, assign, lhs, rhs,
isWholeAllocatableAssignment,
keepLhsLengthInAllocatableAssignment);
else
hlfir::AssignOp::create(builder, loc, rhs, lhs,
isWholeAllocatableAssignment,

View File

@ -155,3 +155,12 @@ cuf::DataAttributeAttr Fortran::lower::translateSymbolCUFDataAttribute(
Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate());
return cuf::getDataAttribute(mlirContext, cudaAttr);
}
/// Return true if \p rhs is defined by an hlfir.elemental operation whose body
/// contains exactly one hlfir.designate, exactly one fir.load and exactly one
/// fir.convert. Returns false for any other value, including values that have
/// no defining operation (e.g. block arguments).
bool Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
  // Value::getDefiningOp<OpTy>() tolerates a null defining operation, whereas
  // calling dyn_cast directly on the result of getDefiningOp() does not.
  auto elOp = rhs.getDefiningOp<hlfir::ElementalOp>();
  if (!elOp)
    return false;
  return llvm::hasSingleElement(
             elOp.getBody()->getOps<hlfir::DesignateOp>()) &&
         llvm::hasSingleElement(elOp.getBody()->getOps<fir::LoadOp>()) &&
         llvm::hasSingleElement(elOp.getBody()->getOps<fir::ConvertOp>());
}

View File

@ -474,3 +474,51 @@ end
! CHECK: cuf.data_transfer %{{.*}} to %{{.*}} {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.box<!fir.array<?xf64>>
! CHECK: hlfir.assign %{{.*}} to %{{.*}} : f64, !fir.ref<f64>
! CHECK: fir.freemem %{{.*}} : !fir.heap<!fir.array<?xf64>>
! Host = device assignment that needs a type conversion: the device array `d`
! is real(2) while the host array `hd` is real(4). Both arrays take their
! extents from the dummy arguments i, j and k.
subroutine sub26(i, j, k)
integer :: i, j, k
real(2), dimension(i,j,k), device :: d
real(4), dimension(i,j,k) :: hd
hd = d
end subroutine
! CHECK-LABEL: func.func @_QPsub26
! CHECK: %[[ALLOC_D:.*]] = cuf.alloc !fir.array<?x?x?xf16>, %{{.*}}, %{{.*}}, %{{.*}} : index, index, index {bindc_name = "d", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub26Ed"} -> !fir.ref<!fir.array<?x?x?xf16>>
! CHECK: %[[D:.*]]:2 = hlfir.declare %[[ALLOC_D]](%{{.*}}) {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub26Ed"} : (!fir.ref<!fir.array<?x?x?xf16>>, !fir.shape<3>) -> (!fir.box<!fir.array<?x?x?xf16>>, !fir.ref<!fir.array<?x?x?xf16>>)
! CHECK: %[[HD:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub26Ehd"} : (!fir.ref<!fir.array<?x?x?xf32>>, !fir.shape<3>) -> (!fir.box<!fir.array<?x?x?xf32>>, !fir.ref<!fir.array<?x?x?xf32>>)
! CHECK: %[[ALLOC:.*]] = fir.allocmem !fir.array<?x?x?xf16>, %{{.*}}, %{{.*}}, %{{.*}} {bindc_name = ".tmp", uniq_name = ""}
! CHECK: %[[TEMP:.*]]:2 = hlfir.declare %[[ALLOC]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?x?x?xf16>>, !fir.shape<3>) -> (!fir.box<!fir.array<?x?x?xf16>>, !fir.heap<!fir.array<?x?x?xf16>>)
! CHECK: cuf.data_transfer %[[D]]#0 to %[[TEMP]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.box<!fir.array<?x?x?xf16>>, !fir.box<!fir.array<?x?x?xf16>>
! CHECK: %[[ELE:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<3>) -> !hlfir.expr<?x?x?xf32> {
! CHECK: ^bb0(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index):
! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[TEMP]]#0 (%{{.*}}, %{{.*}}, %{{.*}}) : (!fir.box<!fir.array<?x?x?xf16>>, index, index, index) -> !fir.ref<f16>
! CHECK: %[[LOAD:.*]] = fir.load %[[DESIGNATE]] : !fir.ref<f16>
! CHECK: %[[CONV:.*]] = fir.convert %[[LOAD]] : (f16) -> f32
! CHECK: hlfir.yield_element %[[CONV]] : f32
! CHECK: }
! CHECK: hlfir.assign %[[ELE]] to %[[HD]]#0 : !hlfir.expr<?x?x?xf32>, !fir.box<!fir.array<?x?x?xf32>>
! Same host = device assignment with conversion as sub26 (real(2) device array
! assigned to a real(4) host array), but with constant extents 10x20x30.
subroutine sub27()
real(2), dimension(10, 20, 30), device :: d
real(4), dimension(10, 20, 30) :: hd
hd = d
end subroutine
! CHECK-LABEL: func.func @_QPsub27()
! CHECK: %[[ALLOC_D:.*]] = cuf.alloc !fir.array<10x20x30xf16> {bindc_name = "d", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub27Ed"} -> !fir.ref<!fir.array<10x20x30xf16>>
! CHECK: %[[D:.*]]:2 = hlfir.declare %[[ALLOC_D]](%{{.*}}) {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub27Ed"} : (!fir.ref<!fir.array<10x20x30xf16>>, !fir.shape<3>) -> (!fir.ref<!fir.array<10x20x30xf16>>, !fir.ref<!fir.array<10x20x30xf16>>)
! CHECK: %[[ALLOC_HD:.*]] = fir.alloca !fir.array<10x20x30xf32> {bindc_name = "hd", uniq_name = "_QFsub27Ehd"}
! CHECK: %[[HD:.*]]:2 = hlfir.declare %[[ALLOC_HD]](%{{.*}}) {uniq_name = "_QFsub27Ehd"} : (!fir.ref<!fir.array<10x20x30xf32>>, !fir.shape<3>) -> (!fir.ref<!fir.array<10x20x30xf32>>, !fir.ref<!fir.array<10x20x30xf32>>)
! CHECK: %[[ALLOC_TEMP:.*]] = fir.allocmem !fir.array<10x20x30xf16> {bindc_name = ".tmp", uniq_name = ""}
! CHECK: %[[TEMP:.*]]:2 = hlfir.declare %[[ALLOC_TEMP]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<10x20x30xf16>>, !fir.shape<3>) -> (!fir.heap<!fir.array<10x20x30xf16>>, !fir.heap<!fir.array<10x20x30xf16>>)
! CHECK: cuf.data_transfer %[[D]]#0 to %[[TEMP]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.array<10x20x30xf16>>, !fir.heap<!fir.array<10x20x30xf16>>
! CHECK: %[[ELE:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<3>) -> !hlfir.expr<10x20x30xf32> {
! CHECK: ^bb0(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index):
! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[TEMP]]#0 (%{{.*}}, %{{.*}}, %{{.*}}) : (!fir.heap<!fir.array<10x20x30xf16>>, index, index, index) -> !fir.ref<f16>
! CHECK: %[[LOAD:.*]] = fir.load %[[DESIGNATE]] : !fir.ref<f16>
! CHECK: %[[CONV:.*]] = fir.convert %[[LOAD]] : (f16) -> f32
! CHECK: hlfir.yield_element %[[CONV]] : f32
! CHECK: }
! CHECK: hlfir.assign %[[ELE]] to %[[HD]]#0 : !hlfir.expr<10x20x30xf32>, !fir.ref<!fir.array<10x20x30xf32>>