diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 1f2d47261112..135d1e4007d4 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -226,23 +226,6 @@ class ROCDL_SpecialIdRegisterOp : }]; } -// TODO(krzysz00): This should be a lowering pattern, not an op. -class ROCDL_DimGetterFunctionOp traits = []> : - ROCDL_Op, - Results<(outs LLVM_Type:$res)>, Arguments<(ins OptionalAttr:$range)> { - string llvmBuilder = "$res = createDimGetterFunctionCall(builder, op, \"" - # device_function # "\", " # parameter # ");"; - let assemblyFormat = "(`range` $range^)? attr-dict `:` type($res)"; - - // Temporaly builder until Nvidia ops also support range attributes. - let builders = [ - OpBuilder<(ins "Type":$resultType), [{ - build($_builder, $_state, resultType, ::mlir::LLVM::ConstantRangeAttr{}); - }]> - ]; -} - //===----------------------------------------------------------------------===// // ROCDL vector types definitions //===----------------------------------------------------------------------===// @@ -451,28 +434,6 @@ def ROCDL_ClusterWorkgroupIdZOp : ROCDL_SpecialIdRegisterOp<"cluster.workgroup.i def ROCDL_WaveId : ROCDL_SpecialIdRegisterOp<"wave.id">; def ROCDL_WavefrontSizeOp : ROCDL_SpecialIdRegisterOp<"wavefrontsize">; -//===----------------------------------------------------------------------===// -// Thread range and Block range -//===----------------------------------------------------------------------===// - -def ROCDL_BlockDimXOp : ROCDL_DimGetterFunctionOp<"workgroup.dim.x", - "__ockl_get_local_size", 0>; - -def ROCDL_BlockDimYOp : ROCDL_DimGetterFunctionOp<"workgroup.dim.y", - "__ockl_get_local_size", 1>; - -def ROCDL_BlockDimZOp : ROCDL_DimGetterFunctionOp<"workgroup.dim.z", - "__ockl_get_local_size", 2>; - -def ROCDL_GridDimXOp : ROCDL_DimGetterFunctionOp<"grid.dim.x", - "__ockl_get_num_groups", 0>; - -def ROCDL_GridDimYOp : ROCDL_DimGetterFunctionOp<"grid.dim.y", - "__ockl_get_num_groups", 1>; - -def ROCDL_GridDimZOp : ROCDL_DimGetterFunctionOp<"grid.dim.z", - "__ockl_get_num_groups", 2>; - //===----------------------------------------------------------------------===// // Synchronization primitives //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt index ce914c0ea3dd..31ac47ff3554 100644 --- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt @@ -18,6 +18,7 @@ add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms AttrToSPIRVConverter.cpp GPUToLLVMConversion.cpp GPUOpsLowering.cpp + IndexIntrinsicsOpLowering.cpp DEPENDS MLIRConversionPassIncGen diff --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp new file mode 100644 index 000000000000..ad3ae74d0c68 --- /dev/null +++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp @@ -0,0 +1,82 @@ +//===- IndexIntrinsicsOpLowering.cpp - GPU Index Op Lowering --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "IndexIntrinsicsOpLowering.h" + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/IR/BuiltinAttributes.h" + +using namespace mlir; +using namespace mlir::gpu::index_lowering; + +LLVM::ConstantRangeAttr mlir::gpu::index_lowering::getIndexOpRange( + Operation *op, gpu::Dimension dim, std::optional opUpperBound, + IndexKind indexKind, IntrType intrType, unsigned bitWidth) { + // Order of priority for bounds: + // 1. The upper_bound attribute + // 2. Inherent attributes on a surrounding gpu.func + // 3. Discardable attributes on a surrounding function of any kind + // The below code handles these in reverse order so that more important + // sources overwrite less important ones. + DenseI32ArrayAttr funcBounds = nullptr; + if (auto funcOp = op->getParentOfType()) { + switch (indexKind) { + case IndexKind::Block: { + auto blockHelper = + gpu::GPUDialect::KnownBlockSizeAttrHelper(op->getContext()); + if (blockHelper.isAttrPresent(funcOp)) + funcBounds = blockHelper.getAttr(funcOp); + break; + } + case IndexKind::Grid: { + auto gridHelper = + gpu::GPUDialect::KnownGridSizeAttrHelper(op->getContext()); + if (gridHelper.isAttrPresent(funcOp)) + funcBounds = gridHelper.getAttr(funcOp); + break; + } + case IndexKind::Cluster: { + auto clusterHelper = + gpu::GPUDialect::KnownClusterSizeAttrHelper(op->getContext()); + if (clusterHelper.isAttrPresent(funcOp)) + funcBounds = clusterHelper.getAttr(funcOp); + break; + } + case IndexKind::Other: + break; + } + } + if (auto gpuFunc = op->getParentOfType()) { + switch (indexKind) { + case IndexKind::Block: + funcBounds = gpuFunc.getKnownBlockSizeAttr(); + break; + case IndexKind::Grid: + funcBounds = gpuFunc.getKnownGridSizeAttr(); + break; + case IndexKind::Cluster: + funcBounds = gpuFunc.getKnownClusterSizeAttr(); + break; + case IndexKind::Other: + break; + } + } + std::optional upperBound; + if (funcBounds) + upperBound = funcBounds.asArrayRef()[static_cast(dim)]; + if (opUpperBound) + upperBound = *opUpperBound; + + if (!upperBound || intrType == IntrType::None) + return nullptr; + + uint32_t min = (intrType == IntrType::Dim ? 1u : 0u); + uint32_t max = + llvm::SaturatingAdd(*upperBound, (intrType == IntrType::Id ? 0u : 1u)); + return LLVM::ConstantRangeAttr::get(op->getContext(), bitWidth, min, max); +} diff --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h index ae0239132e7d..186823e1b40c 100644 --- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h +++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h @@ -1,4 +1,4 @@ -//===- IndexIntrinsicsOpLowering.h - GPU IndexOps Lowering class *- C++ -*-===// +//===- IndexIntrinsicsOpLowering.h - GPU Index Op Lowering ------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -12,7 +12,6 @@ #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/BuiltinAttributes.h" -#include namespace mlir { namespace gpu { @@ -24,6 +23,16 @@ enum class IntrType : uint32_t { Dim = 2, }; +/// Returns a ConstantRangeAttr for a GPU index op, or nullptr if no bounds +/// are found. `bitWidth` controls the width of the returned range. +/// Checks the provided upper_bound from the op (highest priority), inherent +/// attrs on enclosing `gpu.func`s, and discardable attributes on other +/// enclosing function ops (lowest priority). +LLVM::ConstantRangeAttr getIndexOpRange(Operation *op, gpu::Dimension dim, + std::optional opUpperBound, + IndexKind indexKind, IntrType intrType, + unsigned bitWidth); + // Rewriting that replaces Op with XOp, YOp, or ZOp depending on the dimension // that Op operates on. Op is assumed to return an `index` value and // XOp, YOp and ZOp are assumed to return an `llvm.i32` value. Depending on @@ -54,7 +63,7 @@ public: LogicalResult matchAndRewrite(Op op, typename Op::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto loc = op->getLoc(); + Location loc = op->getLoc(); MLIRContext *context = rewriter.getContext(); Operation *newOp; switch (op.getDimension()) { @@ -69,70 +78,13 @@ public: break; } - // Order of priority for bounds: - // 1. The upper_bound attribute - // 2. Inherent attributes on a surrounding gpu.func - // 3. Discardable attributes on a surrounding function of any kind - // The below code handles these in reverse order so that more important - // sources overwrite less important ones. - DenseI32ArrayAttr funcBounds = nullptr; - if (auto funcOp = op->template getParentOfType()) { - switch (indexKind) { - case IndexKind::Block: { - auto blockHelper = - gpu::GPUDialect::KnownBlockSizeAttrHelper(op.getContext()); - if (blockHelper.isAttrPresent(funcOp)) - funcBounds = blockHelper.getAttr(funcOp); - break; - } - case IndexKind::Grid: { - auto gridHelper = - gpu::GPUDialect::KnownGridSizeAttrHelper(op.getContext()); - if (gridHelper.isAttrPresent(funcOp)) - funcBounds = gridHelper.getAttr(funcOp); - break; - } - case IndexKind::Cluster: { - auto clusterHelper = - gpu::GPUDialect::KnownClusterSizeAttrHelper(op.getContext()); - if (clusterHelper.isAttrPresent(funcOp)) - funcBounds = clusterHelper.getAttr(funcOp); - break; - } - case IndexKind::Other: - break; - } - } - if (auto gpuFunc = op->template getParentOfType()) { - switch (indexKind) { - case IndexKind::Block: - funcBounds = gpuFunc.getKnownBlockSizeAttr(); - break; - case IndexKind::Grid: - funcBounds = gpuFunc.getKnownGridSizeAttr(); - break; - case IndexKind::Cluster: - funcBounds = gpuFunc.getKnownClusterSizeAttr(); - break; - case IndexKind::Other: - break; - } - } - std::optional upperBound; - if (funcBounds) - upperBound = - funcBounds.asArrayRef()[static_cast(op.getDimension())]; - if (auto opBound = op.getUpperBound()) - upperBound = opBound->getZExtValue(); + std::optional opBound; + if (auto bound = op.getUpperBound()) + opBound = static_cast(bound->getZExtValue()); + if (auto range = getIndexOpRange(op, op.getDimension(), opBound, indexKind, + intrType, /*bitWidth=*/32)) + newOp->setAttr("range", range); - if (upperBound && intrType != IntrType::None) { - int32_t min = (intrType == IntrType::Dim ? 1 : 0); - int32_t max = *upperBound == std::numeric_limits::max() - ? *upperBound - : *upperBound + (intrType == IntrType::Id ? 0 : 1); - newOp->setAttr("range", LLVM::ConstantRangeAttr::get( - rewriter.getContext(), 32, min, max)); - } if (indexBitwidth > 32) { newOp = LLVM::SExtOp::create(rewriter, loc, IntegerType::get(context, indexBitwidth), diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index 65353fedc9c4..b1a4627977f8 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -102,6 +102,64 @@ static Value getLaneId(RewriterBase &rewriter, Location loc) { return laneId; } +/// Maximum number of threads per block dimension on AMD GPUs. +static constexpr int64_t kMaxThreadsPerBlockDim = 1024; + +/// Emits a call to an OCKL block/grid size function corresponding to +/// `indexKind` with argument `dim`, querying for upper bounds in the context +/// surrounding `contextOp` as a fallback for an unknown/unavailable +/// `opUpperBound`. +static Value getOcklDim(RewriterBase &rewriter, + gpu::index_lowering::IndexKind indexKind, + gpu::Dimension dim, Operation *contextOp, + std::optional opUpperBound) { + Location loc = contextOp->getLoc(); + MLIRContext *context = contextOp->getContext(); + + auto i32Ty = IntegerType::get(context, 32); + auto i64Ty = IntegerType::get(context, 64); + + int32_t dimParam = static_cast(dim); + + StringRef functionName; + switch (indexKind) { + case gpu::index_lowering::IndexKind::Block: + functionName = "__ockl_get_local_size"; + break; + case gpu::index_lowering::IndexKind::Grid: + functionName = "__ockl_get_num_groups"; + break; + case gpu::index_lowering::IndexKind::Cluster: + case gpu::index_lowering::IndexKind::Other: + llvm_unreachable("Not valid index kinds for ockl lookup"); + } + + // Declare the ockl function: i64 @functionName(i32). + auto fnType = LLVM::LLVMFunctionType::get(i64Ty, {i32Ty}); + Operation *moduleOp = contextOp->getParentWithTrait(); + LLVM::LLVMFuncOp funcOp = + getOrDefineFunction(moduleOp, loc, rewriter, functionName, fnType); + + // Create the call. + Value dimConst = LLVM::ConstantOp::create(rewriter, loc, i32Ty, dimParam); + auto callOp = + LLVM::CallOp::create(rewriter, loc, funcOp, ValueRange{dimConst}); + + // Set range attribute on the call result if bounds are available. + auto range = gpu::index_lowering::getIndexOpRange( + contextOp, dim, opUpperBound, indexKind, + gpu::index_lowering::IntrType::Dim, /*bitWidth=*/64); + // Fall back to the hardware limit for block dimensions. + if (!range && indexKind == gpu::index_lowering::IndexKind::Block) + range = LLVM::ConstantRangeAttr::get(context, APInt(64, 1), + APInt(64, kMaxThreadsPerBlockDim + 1)); + if (range) { + callOp.setResAttrsAttr(rewriter.getArrayAttr(rewriter.getDictionaryAttr( + rewriter.getNamedAttr(LLVM::LLVMDialect::getRangeAttrName(), range)))); + } + return callOp.getResult(); +} + static constexpr StringLiteral amdgcnDataLayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:" @@ -110,6 +168,36 @@ static constexpr StringLiteral amdgcnDataLayout = "64-S32-A5-G1-ni:7:8:9"; namespace { + +/// Lowers gpu.block_dim / gpu.grid_dim to direct __ockl_get_local_size / +/// __ockl_get_num_groups function calls. +template +struct GPUDimOpToOcklCall final : ConvertOpToLLVMPattern { + GPUDimOpToOcklCall(const LLVMTypeConverter &converter, + gpu::index_lowering::IndexKind indexKind) + : ConvertOpToLLVMPattern(converter), indexKind(indexKind) {} + + LogicalResult + matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + + std::optional opUpperBound; + if (auto bound = op.getUpperBound()) + opUpperBound = static_cast(bound->getZExtValue()); + + Value ocklCall = + getOcklDim(rewriter, indexKind, op.getDimension(), op, opUpperBound); + Value result = truncOrExtToLLVMType(rewriter, loc, ocklCall, + *this->getTypeConverter()); + rewriter.replaceOp(op, result); + return success(); + } + +private: + const gpu::index_lowering::IndexKind indexKind; +}; + struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern { using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; @@ -197,16 +285,37 @@ struct GPUSubgroupIdOpToROCDL : ConvertOpToLLVMPattern { // For older architectures, compute: // subgroup_id = linearized_thread_id / subgroup_size // where linearized_thread_id = tid.x + dim.x * (tid.y + dim.y * tid.z) - Value tidX = ROCDL::ThreadIdXOp::create(rewriter, loc, int32Type); - Value tidY = ROCDL::ThreadIdYOp::create(rewriter, loc, int32Type); - Value tidZ = ROCDL::ThreadIdZOp::create(rewriter, loc, int32Type); - Value dimX = ROCDL::BlockDimXOp::create(rewriter, loc, int32Type); - Value dimY = ROCDL::BlockDimYOp::create(rewriter, loc, int32Type); + auto tidX = ROCDL::ThreadIdXOp::create(rewriter, loc, int32Type); + auto tidY = ROCDL::ThreadIdYOp::create(rewriter, loc, int32Type); + auto tidZ = ROCDL::ThreadIdZOp::create(rewriter, loc, int32Type); + auto setBoundFromContext = [&](Operation *tidOp, gpu::Dimension dim) { + if (LLVM::ConstantRangeAttr range = + gpu::index_lowering::getIndexOpRange( + op, dim, std::nullopt, + gpu::index_lowering::IndexKind::Block, + gpu::index_lowering::IntrType::Id, 32)) + tidOp->setAttr("range", range); + }; + setBoundFromContext(tidX, gpu::Dimension::x); + setBoundFromContext(tidY, gpu::Dimension::y); + setBoundFromContext(tidZ, gpu::Dimension::z); + + auto flags = + LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw; + + auto getBlockDim = [&](gpu::Dimension dim) { + Value dim64 = + getOcklDim(rewriter, gpu::index_lowering::IndexKind::Block, dim, op, + std::nullopt); + Value dimTrunc = + LLVM::TruncOp::create(rewriter, loc, int32Type, dim64, flags); + return dimTrunc; + }; + Value dimX = getBlockDim(gpu::Dimension::x); + Value dimY = getBlockDim(gpu::Dimension::y); // linearized = tid.x + dim.x * (tid.y + dim.y * tid.z) // Thread IDs and dimensions are non-negative and small, so use nuw+nsw. - auto flags = - LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw; Value dimYxTidZ = LLVM::MulOp::create(rewriter, loc, int32Type, dimY, tidZ, flags); Value tidYPlusDimYxTidZ = @@ -626,13 +735,9 @@ void mlir::populateGpuToROCDLConversionPatterns( patterns.add>( converter, IndexKind::Grid, IntrType::Id); - patterns.add< - gpu::index_lowering::OpLowering>( - converter, IndexKind::Block, IntrType::Dim); - patterns.add>( - converter, IndexKind::Grid, IntrType::Dim); + patterns.add>(converter, + IndexKind::Block); + patterns.add>(converter, IndexKind::Grid); patterns.add(converter); patterns.add( converter, diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp index 8142347d80cb..e1168e75f10d 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp @@ -17,7 +17,6 @@ #include "mlir/IR/Operation.h" #include "mlir/Target/LLVMIR/ModuleTranslation.h" -#include "llvm/IR/ConstantRange.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Support/raw_ostream.h" @@ -26,32 +25,6 @@ using namespace mlir; using namespace mlir::LLVM; using mlir::LLVM::detail::createIntrinsicCall; -// Create a call to ROCm-Device-Library function that returns an ID. -// This is intended to specifically call device functions that fetch things like -// block or grid dimensions, and so is limited to functions that take one -// integer parameter. -static llvm::Value *createDimGetterFunctionCall(llvm::IRBuilderBase &builder, - Operation *op, StringRef fnName, - int parameter) { - llvm::Module *module = builder.GetInsertBlock()->getModule(); - llvm::FunctionType *functionType = llvm::FunctionType::get( - llvm::Type::getInt64Ty(module->getContext()), // return type. - llvm::Type::getInt32Ty(module->getContext()), // parameter type. - false); // no variadic arguments. - llvm::Function *fn = dyn_cast( - module->getOrInsertFunction(fnName, functionType).getCallee()); - llvm::Value *fnOp0 = llvm::ConstantInt::get( - llvm::Type::getInt32Ty(module->getContext()), parameter); - auto *call = builder.CreateCall(fn, ArrayRef(fnOp0)); - if (auto rangeAttr = op->getAttrOfType("range")) { - // Zero-extend to 64 bits because the GPU dialect uses 32-bit bounds but - // these ockl functions are defined to be 64-bits - call->addRangeRetAttr(llvm::ConstantRange(rangeAttr.getLower().zext(64), - rangeAttr.getUpper().zext(64))); - } - return call; -} - namespace { /// Implementation of the dialect interface that converts operations belonging /// to the ROCDL dialect to LLVM IR. diff --git a/mlir/test/Conversion/GPUCommon/lower-global-id.mlir b/mlir/test/Conversion/GPUCommon/lower-global-id.mlir index b0274e0f9f29..94b9f9005276 100644 --- a/mlir/test/Conversion/GPUCommon/lower-global-id.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-global-id.mlir @@ -11,9 +11,8 @@ gpu.module @kernel { // ROCDL-LABEL: llvm.func @gpu_global_id() -> i64 { // ROCDL: %[[WORKGROUP_0:.*]] = rocdl.workgroup.id.x : i32 // ROCDL: %[[SEXT_0:.*]] = llvm.sext %[[WORKGROUP_0]] : i32 to i64 -// ROCDL: %[[WORKGROUP_1:.*]] = rocdl.workgroup.dim.x : i32 -// ROCDL: %[[SEXT_1:.*]] = llvm.sext %[[WORKGROUP_1]] : i32 to i64 -// ROCDL: %[[MUL_0:.*]] = llvm.mul %[[SEXT_0]], %[[SEXT_1]] : i64 +// ROCDL: %[[DIM64:.*]] = llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range}) +// ROCDL: %[[MUL_0:.*]] = llvm.mul %[[SEXT_0]], %[[DIM64]] : i64 // ROCDL: %[[WORKITEM_0:.*]] = rocdl.workitem.id.x : i32 // ROCDL: %[[SEXT_2:.*]] = llvm.sext %[[WORKITEM_0]] : i32 to i64 // ROCDL: %[[ADD_0:.*]] = llvm.add %[[SEXT_2]], %[[MUL_0]] : i64 diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir index 030eb0e5eb18..b44216aab3bf 100644 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir @@ -4,36 +4,66 @@ gpu.module @test_module { // CHECK-LABEL: func @subgroup_id() func.func @subgroup_id() -> index { - // GFX12: rocdl.wave.id : i32 - // GFX12: llvm.sext %{{.*}} : i32 to i64 + // GFX12: %[[WAVEID:.+]] = rocdl.wave.id : i32 + // GFX12: llvm.sext %[[WAVEID]] : i32 to i64 - // GFX9-DAG: rocdl.workitem.id.x : i32 - // GFX9-DAG: rocdl.workitem.id.y : i32 - // GFX9-DAG: rocdl.workitem.id.z : i32 - // GFX9-DAG: rocdl.workgroup.dim.x : i32 - // GFX9-DAG: rocdl.workgroup.dim.y : i32 - // GFX9-DAG: llvm.mul %{{.*}}, %{{.*}} overflow - // GFX9-DAG: llvm.add %{{.*}}, %{{.*}} overflow - // GFX9: rocdl.wavefrontsize : i32 - // GFX9: llvm.udiv - // GFX9: llvm.sext %{{.*}} : i32 to i64 + // GFX9-DAG: %[[IDX:.+]] = rocdl.workitem.id.x : i32 + // GFX9-DAG: %[[IDY:.+]] = rocdl.workitem.id.y : i32 + // GFX9-DAG: %[[IDZ:.+]] = rocdl.workitem.id.z : i32 + // GFX9-DAG: %[[DIMX_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C0:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range}) + // Yes, this is checking after the call that uses it. This prevents collisions with other 0s. + // GFX9-DAG: %[[C0]] = llvm.mlir.constant(0 : i32) : i32 + // GFX9-DAG: %[[DIMX:.+]] = llvm.trunc %[[DIMX_I64]] overflow : i64 to i32 + // GFX9-DAG: %[[DIMY_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C1:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range}) + // GFX9-DAG: %[[C1]] = llvm.mlir.constant(1 : i32) : i32 + // GFX9-DAG: %[[DIMY:.+]] = llvm.trunc %[[DIMY_I64]] overflow : i64 to i32 + // GFX9: %[[Z_DY:.+]] = llvm.mul %[[DIMY]], %[[IDZ]] overflow + // GFX9: %[[ZY:.+]] = llvm.add %[[IDY]], %[[Z_DY]] overflow + // GFX9: %[[YZ_DX:.+]] = llvm.mul %[[DIMX]], %[[ZY]] overflow + // GFX9: %[[ZYX:.+]] = llvm.add %[[IDX]], %[[YZ_DX]] overflow + // GFX9: %[[WAVESZ:.+]] = rocdl.wavefrontsize : i32 + // GFX9: %[[RES:.+]] = llvm.udiv %[[ZYX]], %[[WAVESZ]] + // GFX9: llvm.sext %[[RES]] : i32 to i64 %subgroupId = gpu.subgroup_id : index func.return %subgroupId : index } // CHECK-LABEL: func @subgroup_id_with_upper_bound() func.func @subgroup_id_with_upper_bound() -> index { - // GFX12: rocdl.wave.id range : i32 - // GFX12: llvm.sext %{{.*}} : i32 to i64 + // GFX12: %[[WAVEID:.+]] = rocdl.wave.id range : i32 + // GFX12: llvm.sext %[[WAVEID]] : i32 to i64 - // GFX9-DAG: rocdl.workitem.id.x : i32 - // GFX9-DAG: rocdl.workitem.id.y : i32 - // GFX9-DAG: rocdl.workitem.id.z : i32 - // GFX9-DAG: rocdl.workgroup.dim.x : i32 - // GFX9-DAG: rocdl.workgroup.dim.y : i32 - // GFX9: rocdl.wavefrontsize : i32 - // GFX9: llvm.udiv - // GFX9: llvm.sext %{{.*}} : i32 to i64 + // Minimal check to ensure we don't set any bounds based on the subgroup ID bound + // since we don't know which thread ID they go on to. + // GFX9: rocdl.workitem.id.x : i32 + // GFX9-DAG: llvm.call @__ockl_get_local_size({{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range}) + %subgroupId = gpu.subgroup_id upper_bound 4 : index + func.return %subgroupId : index +} + +// CHECK-LABEL: func @subgroup_id_with_workgroup_sizes() +func.func @subgroup_id_with_workgroup_sizes() -> index + attributes {gpu.known_block_size = array} { + // GFX12: %[[WAVEID:.+]] = rocdl.wave.id range : i32 + // GFX12: llvm.sext %[[WAVEID]] : i32 to i64 + + // GFX9-DAG: %[[IDX:.+]] = rocdl.workitem.id.x range : i32 + // GFX9-DAG: %[[IDY:.+]] = rocdl.workitem.id.y range : i32 + // GFX9-DAG: %[[IDZ:.+]] = rocdl.workitem.id.z range : i32 + // GFX9-DAG: %[[DIMX_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C0:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range}) + // Yes, this is checking after the call that uses it. This prevents collisions with other 0s. + // GFX9-DAG: %[[C0]] = llvm.mlir.constant(0 : i32) : i32 + // GFX9-DAG: %[[DIMX:.+]] = llvm.trunc %[[DIMX_I64]] overflow : i64 to i32 + // GFX9-DAG: %[[DIMY_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C1:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range}) + // GFX9-DAG: %[[C1]] = llvm.mlir.constant(1 : i32) : i32 + // GFX9-DAG: %[[DIMY:.+]] = llvm.trunc %[[DIMY_I64]] overflow : i64 to i32 + // GFX9: %[[Z_DY:.+]] = llvm.mul %[[DIMY]], %[[IDZ]] overflow + // GFX9: %[[ZY:.+]] = llvm.add %[[IDY]], %[[Z_DY]] overflow + // GFX9: %[[YZ_DX:.+]] = llvm.mul %[[DIMX]], %[[ZY]] overflow + // GFX9: %[[ZYX:.+]] = llvm.add %[[IDX]], %[[YZ_DX]] overflow + // GFX9: %[[WAVESZ:.+]] = rocdl.wavefrontsize : i32 + // GFX9: %[[RES:.+]] = llvm.udiv %[[ZYX]], %[[WAVESZ]] + // GFX9: llvm.sext %[[RES]] : i32 to i64 %subgroupId = gpu.subgroup_id upper_bound 4 : index func.return %subgroupId : index } diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir index 9c5c6c7cf9c8..3cc9ded6fe91 100755 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -24,14 +24,14 @@ gpu.module @test_module { // CHECK: = llvm.sext %{{.*}} : i32 to i64 %tIdZ = gpu.thread_id z - // CHECK: rocdl.workgroup.dim.x : i32 - // CHECK: = llvm.sext %{{.*}} : i32 to i64 + // CHECK-DAG: %[[BD_C0:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: %[[BD_C1:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[BD_C2:.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK-DAG: llvm.call @__ockl_get_local_size(%[[BD_C0]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range}) %bDimX = gpu.block_dim x - // CHECK: rocdl.workgroup.dim.y : i32 - // CHECK: = llvm.sext %{{.*}} : i32 to i64 + // CHECK-DAG: llvm.call @__ockl_get_local_size(%[[BD_C1]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range}) %bDimY = gpu.block_dim y - // CHECK: rocdl.workgroup.dim.z : i32 - // CHECK: = llvm.sext %{{.*}} : i32 to i64 + // CHECK-DAG: llvm.call @__ockl_get_local_size(%[[BD_C2]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range}) %bDimZ = gpu.block_dim z // CHECK: rocdl.workgroup.id.x : i32 @@ -44,14 +44,14 @@ gpu.module @test_module { // CHECK: = llvm.sext %{{.*}} : i32 to i64 %bIdZ = gpu.block_id z - // CHECK: rocdl.grid.dim.x : i32 - // CHECK: = llvm.sext %{{.*}} : i32 to i64 + // CHECK-DAG: %[[GD_C0:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: %[[GD_C1:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[GD_C2:.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK-DAG: llvm.call @__ockl_get_num_groups(%[[GD_C0]]) : (i32) -> i64 %gDimX = gpu.grid_dim x - // CHECK: rocdl.grid.dim.y : i32 - // CHECK: = llvm.sext %{{.*}} : i32 to i64 + // CHECK-DAG: llvm.call @__ockl_get_num_groups(%[[GD_C1]]) : (i32) -> i64 %gDimY = gpu.grid_dim y - // CHECK: rocdl.grid.dim.z : i32 - // CHECK: = llvm.sext %{{.*}} : i32 to i64 + // CHECK-DAG: llvm.call @__ockl_get_num_groups(%[[GD_C2]]) : (i32) -> i64 %gDimZ = gpu.grid_dim z // CHECK: = rocdl.mbcnt.lo %{{.*}}, %{{.*}} {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range}]} : (i32, i32) -> i32 @@ -100,13 +100,33 @@ gpu.module @test_module { // CHECK: rocdl.workgroup.id.z range : i32 %bIdZ = gpu.block_id z + // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range}) + %bDimX = gpu.block_dim x + // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range}) + %bDimY = gpu.block_dim y + // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range}) + %bDimZ = gpu.block_dim z + + // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range}) + %gDimX = gpu.grid_dim x + // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range}) + %gDimY = gpu.grid_dim y + // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range}) + %gDimZ = gpu.grid_dim z + // "Usage" to make the ID calls not die %0 = arith.addi %tIdX, %tIdY : index %1 = arith.addi %0, %tIdZ : index %2 = arith.addi %1, %bIdX : index %3 = arith.addi %2, %bIdY : index %4 = arith.addi %3, %bIdZ : index - %5 = arith.index_cast %4 : index to i32 + %r0 = arith.addi %4, %bDimX : index + %r1 = arith.addi %r0, %bDimY : index + %r2 = arith.addi %r1, %bDimZ : index + %r3 = arith.addi %r2, %gDimX : index + %r4 = arith.addi %r3, %gDimY : index + %r5 = arith.addi %r4, %gDimZ : index + %5 = arith.index_cast %r5 : index to i32 memref.store %5, %place[] : memref gpu.return } @@ -796,7 +816,8 @@ gpu.module @test_module { func.func @gpu_dim_int_max_upper_bound() -> (index) { - // CHECK32: rocdl.workgroup.dim.x range : i32 + // CHECK32: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range}) + // CHECK32: llvm.trunc %{{.*}} : i64 to i32 %bDimX = gpu.block_dim x upper_bound 2147483647 func.return %bDimX : index } diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index 1a810dce706b..1d835b352e51 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -26,20 +26,8 @@ func.func @rocdl_special_regs() -> i32 { %10 = rocdl.cluster.workgroup.id.y : i32 // CHECK: rocdl.cluster.workgroup.id.z : i32 %11 = rocdl.cluster.workgroup.id.z : i32 - // CHECK: rocdl.workgroup.dim.x : i32 - %12 = rocdl.workgroup.dim.x : i32 - // CHECK: rocdl.workgroup.dim.y : i32 - %13 = rocdl.workgroup.dim.y : i32 - // CHECK: rocdl.workgroup.dim.z : i32 - %14 = rocdl.workgroup.dim.z : i32 - // CHECK: rocdl.grid.dim.x : i32 - %15 = rocdl.grid.dim.x : i32 - // CHECK: rocdl.grid.dim.y : i32 - %16 = rocdl.grid.dim.y : i32 - // CHECK: rocdl.grid.dim.z : i32 - %17 = rocdl.grid.dim.z : i32 // CHECK: rocdl.wave.id : i32 - %18 = rocdl.wave.id : i32 + %12 = rocdl.wave.id : i32 llvm.return %0 : i32 } diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 78a78c0bd1bb..d3ac38ecad32 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -27,37 +27,20 @@ llvm.func @rocdl_special_regs() -> i32 { // CHECK: call range(i32 0, 16) i32 @llvm.amdgcn.cluster.workgroup.id.z() %12 = rocdl.cluster.workgroup.id.z range : i32 - // CHECK: call i64 @__ockl_get_local_size(i32 0) - %13 = rocdl.workgroup.dim.x : i64 - // CHECK: call i64 @__ockl_get_local_size(i32 1) - %14 = rocdl.workgroup.dim.y : i64 - // CHECK: call i64 @__ockl_get_local_size(i32 2) - %15 = rocdl.workgroup.dim.z : i64 - - // CHECK: call i64 @__ockl_get_num_groups(i32 0) - %16 = rocdl.grid.dim.x : i64 - // CHECK: call i64 @__ockl_get_num_groups(i32 1) - %17 = rocdl.grid.dim.y : i64 - // CHECK: call i64 @__ockl_get_num_groups(i32 2) - %18 = rocdl.grid.dim.z : i64 - // CHECK: call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x() - %19 = rocdl.workitem.id.x range : i32 - - // CHECK: call range(i64 1, 65) i64 @__ockl_get_local_size(i32 0) - %20 = rocdl.workgroup.dim.x range : i64 + %13 = rocdl.workitem.id.x range : i32 // CHECK: call i32 @llvm.amdgcn.wave.id() - %21 = rocdl.wave.id : i32 + %14 = rocdl.wave.id : i32 // CHECK: call range(i32 32, 65) i32 @llvm.amdgcn.wave.id() - %22 = rocdl.wave.id range : i32 + %15 = rocdl.wave.id range : i32 // CHECK: call i32 @llvm.amdgcn.wavefrontsize() - %23 = rocdl.wavefrontsize : i32 + %16 = rocdl.wavefrontsize : i32 // CHECK: call range(i32 32, 65) i32 @llvm.amdgcn.wavefrontsize() - %24 = rocdl.wavefrontsize range : i32 + %17 = rocdl.wavefrontsize range : i32 llvm.return %1 : i32 }