[mlir][ROCDL] Improve block/grid_dim handling, fix subgroup ID (#186235)

This began as me chasing down the fact that the subgroup_id pattern introduced lately was causing crashes in translation because of mismatches between the i64 type of the ockl functions being called and the i32 type they'd been assigned, and spilled out into a refactor of how we handle these dimension-lookup functions. This commit removes the {Block,Grid}Dim{X,Y,Z} ops from the rocdl dialect, since they were translating to library calls and not intrinsics, which meant they don't fit into the dialect. Therefore, we instead add a new pattern that rewrites block/grid dimensions to library calls. While I'm there, I go ahead and implement support for upper bounds on these dimensions accessors, adding a fallback bound of 1 <= size < 1024 to those calls. This also meant updating the lowering of subgroup_id to use that same call-generation logic. While I was here, I factored out the "get block/grid bounds from the context" logic from the index op lowering template into a separate function that would be reusable in the new patterns. This also makes the subgroup_id tests stricter. --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: Tim Gymnich <tim@gymni.ch>
2026-03-13 09:57:52 -07:00 · 2026-03-13 09:57:52 -07:00 · ac7cf4c863
commit ac7cf4c863
parent 49250284cf
11 changed files with 315 additions and 220 deletions
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@ -226,23 +226,6 @@ class ROCDL_SpecialIdRegisterOp<string mnemonic> :
  }];
 }

-// TODO(krzysz00): This should be a lowering pattern, not an op.
-class ROCDL_DimGetterFunctionOp<string mnemonic, string device_function,
-                             int parameter, list<Trait> traits = []> :
-  ROCDL_Op<mnemonic, !listconcat(traits, [Pure])>,
-  Results<(outs LLVM_Type:$res)>, Arguments<(ins OptionalAttr<LLVM_ConstantRangeAttr>:$range)> {
-  string llvmBuilder = "$res = createDimGetterFunctionCall(builder, op, \""
-  # device_function # "\", " # parameter # ");";
-  let assemblyFormat = "(`range` $range^)? attr-dict `:` type($res)";
-
-  // Temporaly builder until Nvidia ops also support range attributes.
-  let builders = [
-    OpBuilder<(ins "Type":$resultType), [{
-      build($_builder, $_state, resultType, ::mlir::LLVM::ConstantRangeAttr{});
-    }]>
-  ];
-}
-
 //===----------------------------------------------------------------------===//
 // ROCDL vector types definitions
 //===----------------------------------------------------------------------===//
@ -451,28 +434,6 @@ def ROCDL_ClusterWorkgroupIdZOp : ROCDL_SpecialIdRegisterOp<"cluster.workgroup.i
 def ROCDL_WaveId : ROCDL_SpecialIdRegisterOp<"wave.id">;
 def ROCDL_WavefrontSizeOp : ROCDL_SpecialIdRegisterOp<"wavefrontsize">;

-//===----------------------------------------------------------------------===//
-// Thread range and Block range
-//===----------------------------------------------------------------------===//
-
-def ROCDL_BlockDimXOp : ROCDL_DimGetterFunctionOp<"workgroup.dim.x",
-                                               "__ockl_get_local_size", 0>;
-
-def ROCDL_BlockDimYOp : ROCDL_DimGetterFunctionOp<"workgroup.dim.y",
-                                               "__ockl_get_local_size", 1>;
-
-def ROCDL_BlockDimZOp : ROCDL_DimGetterFunctionOp<"workgroup.dim.z",
-                                               "__ockl_get_local_size", 2>;
-
-def ROCDL_GridDimXOp : ROCDL_DimGetterFunctionOp<"grid.dim.x",
-                                               "__ockl_get_num_groups", 0>;
-
-def ROCDL_GridDimYOp : ROCDL_DimGetterFunctionOp<"grid.dim.y",
-                                               "__ockl_get_num_groups", 1>;
-
-def ROCDL_GridDimZOp : ROCDL_DimGetterFunctionOp<"grid.dim.z",
-                                               "__ockl_get_num_groups", 2>;
-
 //===----------------------------------------------------------------------===//
 // Synchronization primitives
 //===----------------------------------------------------------------------===//
--- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
@ -18,6 +18,7 @@ add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms
  AttrToSPIRVConverter.cpp
  GPUToLLVMConversion.cpp
  GPUOpsLowering.cpp
+  IndexIntrinsicsOpLowering.cpp

  DEPENDS
  MLIRConversionPassIncGen
--- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp
@ -0,0 +1,82 @@
+//===- IndexIntrinsicsOpLowering.cpp - GPU Index Op Lowering --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "IndexIntrinsicsOpLowering.h"
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/IR/BuiltinAttributes.h"
+
+using namespace mlir;
+using namespace mlir::gpu::index_lowering;
+
+LLVM::ConstantRangeAttr mlir::gpu::index_lowering::getIndexOpRange(
+    Operation *op, gpu::Dimension dim, std::optional<uint32_t> opUpperBound,
+    IndexKind indexKind, IntrType intrType, unsigned bitWidth) {
+  // Order of priority for bounds:
+  // 1. The upper_bound attribute
+  // 2. Inherent attributes on a surrounding gpu.func
+  // 3. Discardable attributes on a surrounding function of any kind
+  // The below code handles these in reverse order so that more important
+  // sources overwrite less important ones.
+  DenseI32ArrayAttr funcBounds = nullptr;
+  if (auto funcOp = op->getParentOfType<FunctionOpInterface>()) {
+    switch (indexKind) {
+    case IndexKind::Block: {
+      auto blockHelper =
+          gpu::GPUDialect::KnownBlockSizeAttrHelper(op->getContext());
+      if (blockHelper.isAttrPresent(funcOp))
+        funcBounds = blockHelper.getAttr(funcOp);
+      break;
+    }
+    case IndexKind::Grid: {
+      auto gridHelper =
+          gpu::GPUDialect::KnownGridSizeAttrHelper(op->getContext());
+      if (gridHelper.isAttrPresent(funcOp))
+        funcBounds = gridHelper.getAttr(funcOp);
+      break;
+    }
+    case IndexKind::Cluster: {
+      auto clusterHelper =
+          gpu::GPUDialect::KnownClusterSizeAttrHelper(op->getContext());
+      if (clusterHelper.isAttrPresent(funcOp))
+        funcBounds = clusterHelper.getAttr(funcOp);
+      break;
+    }
+    case IndexKind::Other:
+      break;
+    }
+  }
+  if (auto gpuFunc = op->getParentOfType<gpu::GPUFuncOp>()) {
+    switch (indexKind) {
+    case IndexKind::Block:
+      funcBounds = gpuFunc.getKnownBlockSizeAttr();
+      break;
+    case IndexKind::Grid:
+      funcBounds = gpuFunc.getKnownGridSizeAttr();
+      break;
+    case IndexKind::Cluster:
+      funcBounds = gpuFunc.getKnownClusterSizeAttr();
+      break;
+    case IndexKind::Other:
+      break;
+    }
+  }
+  std::optional<uint32_t> upperBound;
+  if (funcBounds)
+    upperBound = funcBounds.asArrayRef()[static_cast<uint32_t>(dim)];
+  if (opUpperBound)
+    upperBound = *opUpperBound;
+
+  if (!upperBound || intrType == IntrType::None)
+    return nullptr;
+
+  uint32_t min = (intrType == IntrType::Dim ? 1u : 0u);
+  uint32_t max =
+      llvm::SaturatingAdd(*upperBound, (intrType == IntrType::Id ? 0u : 1u));
+  return LLVM::ConstantRangeAttr::get(op->getContext(), bitWidth, min, max);
+}
--- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
@ -1,4 +1,4 @@
-//===- IndexIntrinsicsOpLowering.h - GPU IndexOps Lowering class *- C++ -*-===//
+//===- IndexIntrinsicsOpLowering.h - GPU Index Op Lowering ------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@ -12,7 +12,6 @@
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/BuiltinAttributes.h"
-#include <limits>

 namespace mlir {
 namespace gpu {
@ -24,6 +23,16 @@ enum class IntrType : uint32_t {
  Dim = 2,
 };

+/// Returns a ConstantRangeAttr for a GPU index op, or nullptr if no bounds
+/// are found. `bitWidth` controls the width of the returned range.
+/// Checks the provided upper_bound from the op (highest priority), inherent
+/// attrs on enclosing `gpu.func`s, and discardable attributes on other
+/// enclosing function ops (lowest priority).
+LLVM::ConstantRangeAttr getIndexOpRange(Operation *op, gpu::Dimension dim,
+                                        std::optional<uint32_t> opUpperBound,
+                                        IndexKind indexKind, IntrType intrType,
+                                        unsigned bitWidth);
+
 // Rewriting that replaces Op with XOp, YOp, or ZOp depending on the dimension
 // that Op operates on.  Op is assumed to return an `index` value and
 // XOp, YOp and ZOp are assumed to return an `llvm.i32` value.  Depending on
@ -54,7 +63,7 @@ public:
  LogicalResult
  matchAndRewrite(Op op, typename Op::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
-    auto loc = op->getLoc();
+    Location loc = op->getLoc();
    MLIRContext *context = rewriter.getContext();
    Operation *newOp;
    switch (op.getDimension()) {
@ -69,70 +78,13 @@ public:
      break;
    }

-    // Order of priority for bounds:
-    // 1. The upper_bound attribute
-    // 2. Inherent attributes on a surrounding gpu.func
-    // 3. Discardable attributes on a surrounding function of any kind
-    // The below code handles these in reverse order so that more important
-    // sources overwrite less important ones.
-    DenseI32ArrayAttr funcBounds = nullptr;
-    if (auto funcOp = op->template getParentOfType<FunctionOpInterface>()) {
-      switch (indexKind) {
-      case IndexKind::Block: {
-        auto blockHelper =
-            gpu::GPUDialect::KnownBlockSizeAttrHelper(op.getContext());
-        if (blockHelper.isAttrPresent(funcOp))
-          funcBounds = blockHelper.getAttr(funcOp);
-        break;
-      }
-      case IndexKind::Grid: {
-        auto gridHelper =
-            gpu::GPUDialect::KnownGridSizeAttrHelper(op.getContext());
-        if (gridHelper.isAttrPresent(funcOp))
-          funcBounds = gridHelper.getAttr(funcOp);
-        break;
-      }
-      case IndexKind::Cluster: {
-        auto clusterHelper =
-            gpu::GPUDialect::KnownClusterSizeAttrHelper(op.getContext());
-        if (clusterHelper.isAttrPresent(funcOp))
-          funcBounds = clusterHelper.getAttr(funcOp);
-        break;
-      }
-      case IndexKind::Other:
-        break;
-      }
-    }
-    if (auto gpuFunc = op->template getParentOfType<gpu::GPUFuncOp>()) {
-      switch (indexKind) {
-      case IndexKind::Block:
-        funcBounds = gpuFunc.getKnownBlockSizeAttr();
-        break;
-      case IndexKind::Grid:
-        funcBounds = gpuFunc.getKnownGridSizeAttr();
-        break;
-      case IndexKind::Cluster:
-        funcBounds = gpuFunc.getKnownClusterSizeAttr();
-        break;
-      case IndexKind::Other:
-        break;
-      }
-    }
-    std::optional<int32_t> upperBound;
-    if (funcBounds)
-      upperBound =
-          funcBounds.asArrayRef()[static_cast<uint32_t>(op.getDimension())];
-    if (auto opBound = op.getUpperBound())
-      upperBound = opBound->getZExtValue();
+    std::optional<uint32_t> opBound;
+    if (auto bound = op.getUpperBound())
+      opBound = static_cast<uint32_t>(bound->getZExtValue());
+    if (auto range = getIndexOpRange(op, op.getDimension(), opBound, indexKind,
+                                     intrType, /*bitWidth=*/32))
+      newOp->setAttr("range", range);

-    if (upperBound && intrType != IntrType::None) {
-      int32_t min = (intrType == IntrType::Dim ? 1 : 0);
-      int32_t max = *upperBound == std::numeric_limits<int32_t>::max()
-                        ? *upperBound
-                        : *upperBound + (intrType == IntrType::Id ? 0 : 1);
-      newOp->setAttr("range", LLVM::ConstantRangeAttr::get(
-                                  rewriter.getContext(), 32, min, max));
-    }
    if (indexBitwidth > 32) {
      newOp = LLVM::SExtOp::create(rewriter, loc,
                                   IntegerType::get(context, indexBitwidth),
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@ -102,6 +102,64 @@ static Value getLaneId(RewriterBase &rewriter, Location loc) {
  return laneId;
 }

+/// Maximum number of threads per block dimension on AMD GPUs.
+static constexpr int64_t kMaxThreadsPerBlockDim = 1024;
+
+/// Emits a call to an OCKL block/grid size function corresponding to
+/// `indexKind` with argument `dim`, querying for upper bounds in the context
+/// surrounding `contextOp` as a fallback for an unknown/unavailable
+/// `opUpperBound`.
+static Value getOcklDim(RewriterBase &rewriter,
+                        gpu::index_lowering::IndexKind indexKind,
+                        gpu::Dimension dim, Operation *contextOp,
+                        std::optional<uint32_t> opUpperBound) {
+  Location loc = contextOp->getLoc();
+  MLIRContext *context = contextOp->getContext();
+
+  auto i32Ty = IntegerType::get(context, 32);
+  auto i64Ty = IntegerType::get(context, 64);
+
+  int32_t dimParam = static_cast<int32_t>(dim);
+
+  StringRef functionName;
+  switch (indexKind) {
+  case gpu::index_lowering::IndexKind::Block:
+    functionName = "__ockl_get_local_size";
+    break;
+  case gpu::index_lowering::IndexKind::Grid:
+    functionName = "__ockl_get_num_groups";
+    break;
+  case gpu::index_lowering::IndexKind::Cluster:
+  case gpu::index_lowering::IndexKind::Other:
+    llvm_unreachable("Not valid index kinds for ockl lookup");
+  }
+
+  // Declare the ockl function: i64 @functionName(i32).
+  auto fnType = LLVM::LLVMFunctionType::get(i64Ty, {i32Ty});
+  Operation *moduleOp = contextOp->getParentWithTrait<OpTrait::SymbolTable>();
+  LLVM::LLVMFuncOp funcOp =
+      getOrDefineFunction(moduleOp, loc, rewriter, functionName, fnType);
+
+  // Create the call.
+  Value dimConst = LLVM::ConstantOp::create(rewriter, loc, i32Ty, dimParam);
+  auto callOp =
+      LLVM::CallOp::create(rewriter, loc, funcOp, ValueRange{dimConst});
+
+  // Set range attribute on the call result if bounds are available.
+  auto range = gpu::index_lowering::getIndexOpRange(
+      contextOp, dim, opUpperBound, indexKind,
+      gpu::index_lowering::IntrType::Dim, /*bitWidth=*/64);
+  // Fall back to the hardware limit for block dimensions.
+  if (!range && indexKind == gpu::index_lowering::IndexKind::Block)
+    range = LLVM::ConstantRangeAttr::get(context, APInt(64, 1),
+                                         APInt(64, kMaxThreadsPerBlockDim + 1));
+  if (range) {
+    callOp.setResAttrsAttr(rewriter.getArrayAttr(rewriter.getDictionaryAttr(
+        rewriter.getNamedAttr(LLVM::LLVMDialect::getRangeAttrName(), range))));
+  }
+  return callOp.getResult();
+}
+
 static constexpr StringLiteral amdgcnDataLayout =
    "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
    "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"
@ -110,6 +168,36 @@ static constexpr StringLiteral amdgcnDataLayout =
    "64-S32-A5-G1-ni:7:8:9";

 namespace {
+
+/// Lowers gpu.block_dim / gpu.grid_dim to direct __ockl_get_local_size /
+/// __ockl_get_num_groups function calls.
+template <typename OpTy>
+struct GPUDimOpToOcklCall final : ConvertOpToLLVMPattern<OpTy> {
+  GPUDimOpToOcklCall(const LLVMTypeConverter &converter,
+                     gpu::index_lowering::IndexKind indexKind)
+      : ConvertOpToLLVMPattern<OpTy>(converter), indexKind(indexKind) {}
+
+  LogicalResult
+  matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+
+    std::optional<uint32_t> opUpperBound;
+    if (auto bound = op.getUpperBound())
+      opUpperBound = static_cast<uint32_t>(bound->getZExtValue());
+
+    Value ocklCall =
+        getOcklDim(rewriter, indexKind, op.getDimension(), op, opUpperBound);
+    Value result = truncOrExtToLLVMType(rewriter, loc, ocklCall,
+                                        *this->getTypeConverter());
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+
+private:
+  const gpu::index_lowering::IndexKind indexKind;
+};
+
 struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
  using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

@ -197,16 +285,37 @@ struct GPUSubgroupIdOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupIdOp> {
      // For older architectures, compute:
      // subgroup_id = linearized_thread_id / subgroup_size
      // where linearized_thread_id = tid.x + dim.x * (tid.y + dim.y * tid.z)
-      Value tidX = ROCDL::ThreadIdXOp::create(rewriter, loc, int32Type);
-      Value tidY = ROCDL::ThreadIdYOp::create(rewriter, loc, int32Type);
-      Value tidZ = ROCDL::ThreadIdZOp::create(rewriter, loc, int32Type);
-      Value dimX = ROCDL::BlockDimXOp::create(rewriter, loc, int32Type);
-      Value dimY = ROCDL::BlockDimYOp::create(rewriter, loc, int32Type);
+      auto tidX = ROCDL::ThreadIdXOp::create(rewriter, loc, int32Type);
+      auto tidY = ROCDL::ThreadIdYOp::create(rewriter, loc, int32Type);
+      auto tidZ = ROCDL::ThreadIdZOp::create(rewriter, loc, int32Type);
+      auto setBoundFromContext = [&](Operation *tidOp, gpu::Dimension dim) {
+        if (LLVM::ConstantRangeAttr range =
+                gpu::index_lowering::getIndexOpRange(
+                    op, dim, std::nullopt,
+                    gpu::index_lowering::IndexKind::Block,
+                    gpu::index_lowering::IntrType::Id, 32))
+          tidOp->setAttr("range", range);
+      };
+      setBoundFromContext(tidX, gpu::Dimension::x);
+      setBoundFromContext(tidY, gpu::Dimension::y);
+      setBoundFromContext(tidZ, gpu::Dimension::z);
+
+      auto flags =
+          LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
+
+      auto getBlockDim = [&](gpu::Dimension dim) {
+        Value dim64 =
+            getOcklDim(rewriter, gpu::index_lowering::IndexKind::Block, dim, op,
+                       std::nullopt);
+        Value dimTrunc =
+            LLVM::TruncOp::create(rewriter, loc, int32Type, dim64, flags);
+        return dimTrunc;
+      };
+      Value dimX = getBlockDim(gpu::Dimension::x);
+      Value dimY = getBlockDim(gpu::Dimension::y);

      // linearized = tid.x + dim.x * (tid.y + dim.y * tid.z)
      // Thread IDs and dimensions are non-negative and small, so use nuw+nsw.
-      auto flags =
-          LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
      Value dimYxTidZ =
          LLVM::MulOp::create(rewriter, loc, int32Type, dimY, tidZ, flags);
      Value tidYPlusDimYxTidZ =
@ -626,13 +735,9 @@ void mlir::populateGpuToROCDLConversionPatterns(
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
      converter, IndexKind::Grid, IntrType::Id);
-  patterns.add<
-      gpu::index_lowering::OpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
-                                      ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
-      converter, IndexKind::Block, IntrType::Dim);
-  patterns.add<gpu::index_lowering::OpLowering<
-      gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
-      converter, IndexKind::Grid, IntrType::Dim);
+  patterns.add<GPUDimOpToOcklCall<gpu::BlockDimOp>>(converter,
+                                                    IndexKind::Block);
+  patterns.add<GPUDimOpToOcklCall<gpu::GridDimOp>>(converter, IndexKind::Grid);
  patterns.add<GPUReturnOpLowering>(converter);
  patterns.add<GPUFuncOpLowering>(
      converter,
--- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
@ -17,7 +17,6 @@
 #include "mlir/IR/Operation.h"
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"

-#include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Support/raw_ostream.h"
@ -26,32 +25,6 @@ using namespace mlir;
 using namespace mlir::LLVM;
 using mlir::LLVM::detail::createIntrinsicCall;

-// Create a call to ROCm-Device-Library function that returns an ID.
-// This is intended to specifically call device functions that fetch things like
-// block or grid dimensions, and so is limited to functions that take one
-// integer parameter.
-static llvm::Value *createDimGetterFunctionCall(llvm::IRBuilderBase &builder,
-                                                Operation *op, StringRef fnName,
-                                                int parameter) {
-  llvm::Module *module = builder.GetInsertBlock()->getModule();
-  llvm::FunctionType *functionType = llvm::FunctionType::get(
-      llvm::Type::getInt64Ty(module->getContext()), // return type.
-      llvm::Type::getInt32Ty(module->getContext()), // parameter type.
-      false);                                       // no variadic arguments.
-  llvm::Function *fn = dyn_cast<llvm::Function>(
-      module->getOrInsertFunction(fnName, functionType).getCallee());
-  llvm::Value *fnOp0 = llvm::ConstantInt::get(
-      llvm::Type::getInt32Ty(module->getContext()), parameter);
-  auto *call = builder.CreateCall(fn, ArrayRef<llvm::Value *>(fnOp0));
-  if (auto rangeAttr = op->getAttrOfType<LLVM::ConstantRangeAttr>("range")) {
-    // Zero-extend to 64 bits because the GPU dialect uses 32-bit bounds but
-    // these ockl functions are defined to be 64-bits
-    call->addRangeRetAttr(llvm::ConstantRange(rangeAttr.getLower().zext(64),
-                                              rangeAttr.getUpper().zext(64)));
-  }
-  return call;
-}
-
 namespace {
 /// Implementation of the dialect interface that converts operations belonging
 /// to the ROCDL dialect to LLVM IR.
--- a/mlir/test/Conversion/GPUCommon/lower-global-id.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-global-id.mlir
@ -11,9 +11,8 @@ gpu.module @kernel {
 // ROCDL-LABEL:   llvm.func @gpu_global_id() -> i64 {
 // ROCDL:           %[[WORKGROUP_0:.*]] = rocdl.workgroup.id.x : i32
 // ROCDL:           %[[SEXT_0:.*]] = llvm.sext %[[WORKGROUP_0]] : i32 to i64
-// ROCDL:           %[[WORKGROUP_1:.*]] = rocdl.workgroup.dim.x : i32
-// ROCDL:           %[[SEXT_1:.*]] = llvm.sext %[[WORKGROUP_1]] : i32 to i64
-// ROCDL:           %[[MUL_0:.*]] = llvm.mul %[[SEXT_0]], %[[SEXT_1]] : i64
+// ROCDL:           %[[DIM64:.*]] = llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
+// ROCDL:           %[[MUL_0:.*]] = llvm.mul %[[SEXT_0]], %[[DIM64]] : i64
 // ROCDL:           %[[WORKITEM_0:.*]] = rocdl.workitem.id.x : i32
 // ROCDL:           %[[SEXT_2:.*]] = llvm.sext %[[WORKITEM_0]] : i32 to i64
 // ROCDL:           %[[ADD_0:.*]] = llvm.add %[[SEXT_2]], %[[MUL_0]] : i64
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
@ -4,36 +4,66 @@
 gpu.module @test_module {
 // CHECK-LABEL: func @subgroup_id()
 func.func @subgroup_id() -> index {
-  // GFX12: rocdl.wave.id : i32
-  // GFX12: llvm.sext %{{.*}} : i32 to i64
+  // GFX12: %[[WAVEID:.+]] = rocdl.wave.id : i32
+  // GFX12: llvm.sext %[[WAVEID]] : i32 to i64

-  // GFX9-DAG: rocdl.workitem.id.x : i32
-  // GFX9-DAG: rocdl.workitem.id.y : i32
-  // GFX9-DAG: rocdl.workitem.id.z : i32
-  // GFX9-DAG: rocdl.workgroup.dim.x : i32
-  // GFX9-DAG: rocdl.workgroup.dim.y : i32
-  // GFX9-DAG: llvm.mul %{{.*}}, %{{.*}} overflow<nsw, nuw>
-  // GFX9-DAG: llvm.add %{{.*}}, %{{.*}} overflow<nsw, nuw>
-  // GFX9: rocdl.wavefrontsize : i32
-  // GFX9: llvm.udiv
-  // GFX9: llvm.sext %{{.*}} : i32 to i64
+  // GFX9-DAG: %[[IDX:.+]] = rocdl.workitem.id.x : i32
+  // GFX9-DAG: %[[IDY:.+]] = rocdl.workitem.id.y : i32
+  // GFX9-DAG: %[[IDZ:.+]] = rocdl.workitem.id.z : i32
+  // GFX9-DAG: %[[DIMX_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C0:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
+  // Yes, this is checking after the call that uses it. This prevents collisions with other 0s.
+  // GFX9-DAG: %[[C0]] = llvm.mlir.constant(0 : i32) : i32
+  // GFX9-DAG: %[[DIMX:.+]] = llvm.trunc %[[DIMX_I64]] overflow<nsw, nuw> : i64 to i32
+  // GFX9-DAG: %[[DIMY_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C1:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
+  // GFX9-DAG: %[[C1]] = llvm.mlir.constant(1 : i32) : i32
+  // GFX9-DAG: %[[DIMY:.+]] = llvm.trunc %[[DIMY_I64]] overflow<nsw, nuw> : i64 to i32
+  // GFX9: %[[Z_DY:.+]] = llvm.mul %[[DIMY]], %[[IDZ]] overflow<nsw, nuw>
+  // GFX9: %[[ZY:.+]] = llvm.add %[[IDY]], %[[Z_DY]] overflow<nsw, nuw>
+  // GFX9: %[[YZ_DX:.+]] = llvm.mul %[[DIMX]], %[[ZY]] overflow<nsw, nuw>
+  // GFX9: %[[ZYX:.+]] = llvm.add %[[IDX]], %[[YZ_DX]] overflow<nsw, nuw>
+  // GFX9: %[[WAVESZ:.+]] = rocdl.wavefrontsize : i32
+  // GFX9: %[[RES:.+]] = llvm.udiv %[[ZYX]], %[[WAVESZ]]
+  // GFX9: llvm.sext %[[RES]] : i32 to i64
  %subgroupId = gpu.subgroup_id : index
  func.return %subgroupId : index
 }

 // CHECK-LABEL: func @subgroup_id_with_upper_bound()
 func.func @subgroup_id_with_upper_bound() -> index {
-  // GFX12: rocdl.wave.id range <i32, 0, 4> : i32
-  // GFX12: llvm.sext %{{.*}} : i32 to i64
+  // GFX12: %[[WAVEID:.+]] = rocdl.wave.id range <i32, 0, 4> : i32
+  // GFX12: llvm.sext %[[WAVEID]] : i32 to i64

-  // GFX9-DAG: rocdl.workitem.id.x : i32
-  // GFX9-DAG: rocdl.workitem.id.y : i32
-  // GFX9-DAG: rocdl.workitem.id.z : i32
-  // GFX9-DAG: rocdl.workgroup.dim.x : i32
-  // GFX9-DAG: rocdl.workgroup.dim.y : i32
-  // GFX9: rocdl.wavefrontsize : i32
-  // GFX9: llvm.udiv
-  // GFX9: llvm.sext %{{.*}} : i32 to i64
+  // Minimal check to ensure we don't set any bounds based on the subgroup ID bound
+  // since we don't know which thread ID they go on to.
+  // GFX9: rocdl.workitem.id.x : i32
+  // GFX9-DAG: llvm.call @__ockl_get_local_size({{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
+  %subgroupId = gpu.subgroup_id upper_bound 4 : index
+  func.return %subgroupId : index
+}
+
+// CHECK-LABEL: func @subgroup_id_with_workgroup_sizes()
+func.func @subgroup_id_with_workgroup_sizes() -> index
+    attributes {gpu.known_block_size = array<i32: 64, 4, 1>} {
+  // GFX12: %[[WAVEID:.+]] = rocdl.wave.id range <i32, 0, 4> : i32
+  // GFX12: llvm.sext %[[WAVEID]] : i32 to i64
+
+  // GFX9-DAG: %[[IDX:.+]] = rocdl.workitem.id.x range <i32, 0, 64> : i32
+  // GFX9-DAG: %[[IDY:.+]] = rocdl.workitem.id.y range <i32, 0, 4> : i32
+  // GFX9-DAG: %[[IDZ:.+]] = rocdl.workitem.id.z range <i32, 0, 1> : i32
+  // GFX9-DAG: %[[DIMX_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C0:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 65>})
+  // Yes, this is checking after the call that uses it. This prevents collisions with other 0s.
+  // GFX9-DAG: %[[C0]] = llvm.mlir.constant(0 : i32) : i32
+  // GFX9-DAG: %[[DIMX:.+]] = llvm.trunc %[[DIMX_I64]] overflow<nsw, nuw> : i64 to i32
+  // GFX9-DAG: %[[DIMY_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C1:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 5>})
+  // GFX9-DAG: %[[C1]] = llvm.mlir.constant(1 : i32) : i32
+  // GFX9-DAG: %[[DIMY:.+]] = llvm.trunc %[[DIMY_I64]] overflow<nsw, nuw> : i64 to i32
+  // GFX9: %[[Z_DY:.+]] = llvm.mul %[[DIMY]], %[[IDZ]] overflow<nsw, nuw>
+  // GFX9: %[[ZY:.+]] = llvm.add %[[IDY]], %[[Z_DY]] overflow<nsw, nuw>
+  // GFX9: %[[YZ_DX:.+]] = llvm.mul %[[DIMX]], %[[ZY]] overflow<nsw, nuw>
+  // GFX9: %[[ZYX:.+]] = llvm.add %[[IDX]], %[[YZ_DX]] overflow<nsw, nuw>
+  // GFX9: %[[WAVESZ:.+]] = rocdl.wavefrontsize : i32
+  // GFX9: %[[RES:.+]] = llvm.udiv %[[ZYX]], %[[WAVESZ]]
+  // GFX9: llvm.sext %[[RES]] : i32 to i64
  %subgroupId = gpu.subgroup_id upper_bound 4 : index
  func.return %subgroupId : index
 }
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@ -24,14 +24,14 @@ gpu.module @test_module {
    // CHECK: = llvm.sext %{{.*}} : i32 to i64
    %tIdZ = gpu.thread_id z

-    // CHECK: rocdl.workgroup.dim.x : i32
-    // CHECK: = llvm.sext %{{.*}} : i32 to i64
+    // CHECK-DAG: %[[BD_C0:.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK-DAG: %[[BD_C1:.*]] = llvm.mlir.constant(1 : i32) : i32
+    // CHECK-DAG: %[[BD_C2:.*]] = llvm.mlir.constant(2 : i32) : i32
+    // CHECK-DAG: llvm.call @__ockl_get_local_size(%[[BD_C0]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
    %bDimX = gpu.block_dim x
-    // CHECK: rocdl.workgroup.dim.y : i32
-    // CHECK: = llvm.sext %{{.*}} : i32 to i64
+    // CHECK-DAG: llvm.call @__ockl_get_local_size(%[[BD_C1]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
    %bDimY = gpu.block_dim y
-    // CHECK: rocdl.workgroup.dim.z : i32
-    // CHECK: = llvm.sext %{{.*}} : i32 to i64
+    // CHECK-DAG: llvm.call @__ockl_get_local_size(%[[BD_C2]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
    %bDimZ = gpu.block_dim z

    // CHECK: rocdl.workgroup.id.x : i32
@ -44,14 +44,14 @@ gpu.module @test_module {
    // CHECK: = llvm.sext %{{.*}} : i32 to i64
    %bIdZ = gpu.block_id z

-    // CHECK: rocdl.grid.dim.x : i32
-    // CHECK: = llvm.sext %{{.*}} : i32 to i64
+    // CHECK-DAG: %[[GD_C0:.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK-DAG: %[[GD_C1:.*]] = llvm.mlir.constant(1 : i32) : i32
+    // CHECK-DAG: %[[GD_C2:.*]] = llvm.mlir.constant(2 : i32) : i32
+    // CHECK-DAG: llvm.call @__ockl_get_num_groups(%[[GD_C0]]) : (i32) -> i64
    %gDimX = gpu.grid_dim x
-    // CHECK: rocdl.grid.dim.y : i32
-    // CHECK: = llvm.sext %{{.*}} : i32 to i64
+    // CHECK-DAG: llvm.call @__ockl_get_num_groups(%[[GD_C1]]) : (i32) -> i64
    %gDimY = gpu.grid_dim y
-    // CHECK: rocdl.grid.dim.z : i32
-    // CHECK: = llvm.sext %{{.*}} : i32 to i64
+    // CHECK-DAG: llvm.call @__ockl_get_num_groups(%[[GD_C2]]) : (i32) -> i64
    %gDimZ = gpu.grid_dim z

    // CHECK: = rocdl.mbcnt.lo %{{.*}}, %{{.*}} {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range<i32, 0, 32>}]} : (i32, i32) -> i32
@ -100,13 +100,33 @@ gpu.module @test_module {
    // CHECK: rocdl.workgroup.id.z range <i32, 0, 28> : i32
    %bIdZ = gpu.block_id z

+    // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 9>})
+    %bDimX = gpu.block_dim x
+    // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 13>})
+    %bDimY = gpu.block_dim y
+    // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 17>})
+    %bDimZ = gpu.block_dim z
+
+    // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 21>})
+    %gDimX = gpu.grid_dim x
+    // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 25>})
+    %gDimY = gpu.grid_dim y
+    // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 29>})
+    %gDimZ = gpu.grid_dim z
+
    // "Usage" to make the ID calls not die
    %0 = arith.addi %tIdX, %tIdY : index
    %1 = arith.addi %0, %tIdZ : index
    %2 = arith.addi %1, %bIdX : index
    %3 = arith.addi %2, %bIdY : index
    %4 = arith.addi %3, %bIdZ : index
-    %5 = arith.index_cast %4 : index to i32
+    %r0 = arith.addi %4, %bDimX : index
+    %r1 = arith.addi %r0, %bDimY : index
+    %r2 = arith.addi %r1, %bDimZ : index
+    %r3 = arith.addi %r2, %gDimX : index
+    %r4 = arith.addi %r3, %gDimY : index
+    %r5 = arith.addi %r4, %gDimZ : index
+    %5 = arith.index_cast %r5 : index to i32
    memref.store %5, %place[] : memref<i32>
    gpu.return
  }
@ -796,7 +816,8 @@ gpu.module @test_module {
  func.func @gpu_dim_int_max_upper_bound()
      -> (index) {

-    // CHECK32: rocdl.workgroup.dim.x range <i32, 1, 2147483647> : i32
+    // CHECK32: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 2147483648>})
+    // CHECK32: llvm.trunc %{{.*}} : i64 to i32
    %bDimX = gpu.block_dim x upper_bound 2147483647
    func.return %bDimX : index
  }
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@ -26,20 +26,8 @@ func.func @rocdl_special_regs() -> i32 {
  %10 = rocdl.cluster.workgroup.id.y : i32
  // CHECK: rocdl.cluster.workgroup.id.z : i32
  %11 = rocdl.cluster.workgroup.id.z : i32
-  // CHECK: rocdl.workgroup.dim.x : i32
-  %12 = rocdl.workgroup.dim.x : i32
-  // CHECK: rocdl.workgroup.dim.y : i32
-  %13 = rocdl.workgroup.dim.y : i32
-  // CHECK: rocdl.workgroup.dim.z : i32
-  %14 = rocdl.workgroup.dim.z : i32
-  // CHECK: rocdl.grid.dim.x : i32
-  %15 = rocdl.grid.dim.x : i32
-  // CHECK: rocdl.grid.dim.y : i32
-  %16 = rocdl.grid.dim.y : i32
-  // CHECK: rocdl.grid.dim.z : i32
-  %17 = rocdl.grid.dim.z : i32
  // CHECK: rocdl.wave.id : i32
-  %18 = rocdl.wave.id : i32
+  %12 = rocdl.wave.id : i32
  llvm.return %0 : i32
 }

--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@ -27,37 +27,20 @@ llvm.func @rocdl_special_regs() -> i32 {
  // CHECK: call range(i32 0, 16) i32 @llvm.amdgcn.cluster.workgroup.id.z()
  %12 = rocdl.cluster.workgroup.id.z range <i32, 0, 16> : i32

-  // CHECK: call i64 @__ockl_get_local_size(i32 0)
-  %13 = rocdl.workgroup.dim.x : i64
-  // CHECK: call i64 @__ockl_get_local_size(i32 1)
-  %14 = rocdl.workgroup.dim.y : i64
-  // CHECK: call i64 @__ockl_get_local_size(i32 2)
-  %15 = rocdl.workgroup.dim.z : i64
-
-  // CHECK: call i64 @__ockl_get_num_groups(i32 0)
-  %16 = rocdl.grid.dim.x : i64
-  // CHECK: call i64 @__ockl_get_num_groups(i32 1)
-  %17 = rocdl.grid.dim.y : i64
-  // CHECK: call i64 @__ockl_get_num_groups(i32 2)
-  %18 = rocdl.grid.dim.z : i64
-
  // CHECK: call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
-  %19 = rocdl.workitem.id.x range <i32, 0, 64> : i32
-
-  // CHECK: call range(i64 1, 65) i64 @__ockl_get_local_size(i32 0)
-  %20 = rocdl.workgroup.dim.x range <i32, 1, 65> : i64
+  %13 = rocdl.workitem.id.x range <i32, 0, 64> : i32

  // CHECK: call i32 @llvm.amdgcn.wave.id()
-  %21 = rocdl.wave.id : i32
+  %14 = rocdl.wave.id : i32

  // CHECK: call range(i32 32, 65) i32 @llvm.amdgcn.wave.id()
-  %22 = rocdl.wave.id range <i32, 32, 65> : i32
+  %15 = rocdl.wave.id range <i32, 32, 65> : i32

  // CHECK: call i32 @llvm.amdgcn.wavefrontsize()
-  %23 = rocdl.wavefrontsize : i32
+  %16 = rocdl.wavefrontsize : i32

  // CHECK: call range(i32 32, 65) i32 @llvm.amdgcn.wavefrontsize()
-  %24 = rocdl.wavefrontsize range <i32, 32, 65> : i32
+  %17 = rocdl.wavefrontsize range <i32, 32, 65> : i32

  llvm.return %1 : i32
 }