From 150145486e82c8da581f94cdceceb2652d644825 Mon Sep 17 00:00:00 2001 From: Sang Ik Lee Date: Thu, 23 Oct 2025 08:32:53 -0700 Subject: [PATCH] [MLIR][GPU] Generalize gpu.printf op lowering to LLVM call pattern. (#164297) Existing pattern for lowering gpu.printf op to LLVM call uses fixed function name and calling convention. Those two should be exposed as pass option to allow supporting Intel Compute Runtime for GPU. Also adds gpu.printf op pattern to GPU to LLVMSPV pass. It may appear out of place, but integration test is added to XeVM integration test as that is the current best folder for testing with Intel Compute Runtime. Test should be moved in the future if a better test folder is added. --- .../Conversion/GPUCommon/GPUOpsLowering.cpp | 6 ++-- .../lib/Conversion/GPUCommon/GPUOpsLowering.h | 21 ++++++++++--- .../Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp | 5 +++- mlir/test/Conversion/GPUToLLVMSPV/printf.mlir | 16 ++++++++++ .../Dialect/XeVM/GPU/gpu_printf.mlir | 30 +++++++++++++++++++ 5 files changed, 71 insertions(+), 7 deletions(-) create mode 100644 mlir/test/Conversion/GPUToLLVMSPV/printf.mlir create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp index 2285d2695db4..eb662a1b056d 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp @@ -507,7 +507,8 @@ LogicalResult GPUPrintfOpToLLVMCallLowering::matchAndRewrite( LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {ptrType}, /*isVarArg=*/true); LLVM::LLVMFuncOp printfDecl = - getOrDefineFunction(moduleOp, loc, rewriter, "printf", printfType); + getOrDefineFunction(moduleOp, loc, rewriter, funcName, printfType); + printfDecl.setCConv(callingConvention); // Create the global op or find an existing one. 
LLVM::GlobalOp global = getOrCreateStringConstant( @@ -530,7 +531,8 @@ LogicalResult GPUPrintfOpToLLVMCallLowering::matchAndRewrite( printfArgs.push_back(stringStart); printfArgs.append(argsRange.begin(), argsRange.end()); - LLVM::CallOp::create(rewriter, loc, printfDecl, printfArgs); + auto call = LLVM::CallOp::create(rewriter, loc, printfDecl, printfArgs); + call.setCConv(callingConvention); rewriter.eraseOp(gpuPrintfOp); return success(); } diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h index 66d3bb40a8f5..ec74787b2a8e 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h @@ -10,6 +10,7 @@ #include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMAttrs.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" namespace mlir { @@ -142,13 +143,23 @@ struct GPUPrintfOpToHIPLowering : public ConvertOpToLLVMPattern<gpu::PrintfOp> { /// This pass will add a declaration of printf() to the GPUModule if needed /// and separate out the format strings into global constants. For some /// runtimes, such as OpenCL on AMD, this is sufficient setup, as the compiler -/// will lower printf calls to appropriate device-side code +/// will lower printf calls to appropriate device-side code. +/// However not all backends use the same calling convention and function +/// naming. +/// For example, the LLVM SPIRV backend requires calling convention +/// LLVM::cconv::CConv::SPIR_FUNC and function name needs to be +/// mangled as "_Z6printfPU3AS2Kcz". +/// Default callingConvention is LLVM::cconv::CConv::C and +/// funcName is "printf" but they can be customized as needed. 
struct GPUPrintfOpToLLVMCallLowering : public ConvertOpToLLVMPattern<gpu::PrintfOp> { - GPUPrintfOpToLLVMCallLowering(const LLVMTypeConverter &converter, - int addressSpace = 0) + GPUPrintfOpToLLVMCallLowering( + const LLVMTypeConverter &converter, int addressSpace = 0, + LLVM::cconv::CConv callingConvention = LLVM::cconv::CConv::C, + StringRef funcName = "printf") : ConvertOpToLLVMPattern<gpu::PrintfOp>(converter), - addressSpace(addressSpace) {} + addressSpace(addressSpace), callingConvention(callingConvention), + funcName(funcName) {} LogicalResult matchAndRewrite(gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor, @@ -156,6 +167,8 @@ struct GPUPrintfOpToLLVMCallLowering private: int addressSpace; + LLVM::cconv::CConv callingConvention; + StringRef funcName; }; /// Lowering of gpu.printf to a vprintf standard library. diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp index c2363a1a4029..25f1e1b184d6 100644 --- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp +++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp @@ -470,10 +470,13 @@ struct GPUToLLVMSPVConversionPass final gpu::GPUFuncOp, gpu::GlobalIdOp, gpu::GridDimOp, gpu::LaneIdOp, gpu::NumSubgroupsOp, gpu::ReturnOp, gpu::ShuffleOp, gpu::SubgroupIdOp, gpu::SubgroupSizeOp, - gpu::ThreadIdOp>(); + gpu::ThreadIdOp, gpu::PrintfOp>(); populateGpuToLLVMSPVConversionPatterns(converter, patterns); populateGpuMemorySpaceAttributeConversions(converter); + patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/2, + LLVM::cconv::CConv::SPIR_FUNC, + "_Z6printfPU3AS2Kcz"); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) diff --git a/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir b/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir new file mode 100644 index 000000000000..74017e8354cf --- /dev/null +++ b/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir @@ -0,0 +1,16 @@ +// RUN: mlir-opt %s -convert-gpu-to-llvm-spv | FileCheck %s + +gpu.module @test_module { + // CHECK: 
llvm.mlir.global internal constant @[[$PRINT_GLOBAL:[A-Za-z0-9_]+]]("Hello: %d\0A\00") {addr_space = 2 : i32} + // CHECK: llvm.func spir_funccc @_Z6printfPU3AS2Kcz(!llvm.ptr<2>, ...) -> i32 + // CHECK-LABEL: llvm.func spir_funccc @test_printf + // CHECK: (%[[ARG0:.*]]: i32) + gpu.func @test_printf(%arg0: i32) { + // CHECK: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr<2> + // CHECK-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][0, 0] : (!llvm.ptr<2>) -> !llvm.ptr<2>, !llvm.array<11 x i8> + // CHECK-NEXT: %{{.*}} = llvm.call spir_funccc @_Z6printfPU3AS2Kcz(%[[IMM2]], %[[ARG0]]) vararg(!llvm.func<i32 (ptr<2>, ...)>) : (!llvm.ptr<2>, i32) -> i32 + gpu.printf "Hello: %d\n", %arg0 : i32 + gpu.return + } +} + diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir new file mode 100644 index 000000000000..edf8775c7241 --- /dev/null +++ b/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir @@ -0,0 +1,30 @@ +// RUN: mlir-opt %s \ +// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \ +// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \ +// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \ +// RUN: | mlir-runner \ +// RUN: --shared-libs=%mlir_sycl_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --shared-libs=%mlir_c_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +module @test attributes {gpu.container_module} { + gpu.module @test_module { + gpu.func @test_printf(%arg0: i32, %arg1: f32) kernel { + gpu.printf "Hello: %d\n", %arg0 : i32 + gpu.printf "Hello: %f\n", %arg1 : f32 + gpu.return + } + } + + func.func @main() attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c11 = arith.constant 11 : i32 + %c4 = 
arith.constant 4.0 : f32 + // CHECK: Hello: 11 + // CHECK: Hello: 4.000000 + gpu.launch_func @test_module::@test_printf blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%c11 : i32, %c4 : f32) + return + } +}