diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 1f2d47261112..135d1e4007d4 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -226,23 +226,6 @@ class ROCDL_SpecialIdRegisterOp<string mnemonic> :
   }];
 }
 
-// TODO(krzysz00): This should be a lowering pattern, not an op.
-class ROCDL_DimGetterFunctionOp<string mnemonic, string device_function,
-                             int parameter, list<Trait> traits = []> :
-  ROCDL_Op<mnemonic, !listconcat(traits, [Pure])>,
-  Results<(outs LLVM_Type:$res)>, Arguments<(ins OptionalAttr<LLVM_ConstantRangeAttr>:$range)> {
-  string llvmBuilder = "$res = createDimGetterFunctionCall(builder, op, \""
-  # device_function # "\", " # parameter # ");";
-  let assemblyFormat = "(`range` $range^)? attr-dict `:` type($res)";
-
-  // Temporaly builder until Nvidia ops also support range attributes.
-  let builders = [
-    OpBuilder<(ins "Type":$resultType), [{
-      build($_builder, $_state, resultType, ::mlir::LLVM::ConstantRangeAttr{});
-    }]>
-  ];
-}
-
 //===----------------------------------------------------------------------===//
 // ROCDL vector types definitions
 //===----------------------------------------------------------------------===//
@@ -451,28 +434,6 @@ def ROCDL_ClusterWorkgroupIdZOp : ROCDL_SpecialIdRegisterOp<"cluster.workgroup.i
 def ROCDL_WaveId : ROCDL_SpecialIdRegisterOp<"wave.id">;
 def ROCDL_WavefrontSizeOp : ROCDL_SpecialIdRegisterOp<"wavefrontsize">;
 
-//===----------------------------------------------------------------------===//
-// Thread range and Block range
-//===----------------------------------------------------------------------===//
-
-def ROCDL_BlockDimXOp : ROCDL_DimGetterFunctionOp<"workgroup.dim.x",
-                                               "__ockl_get_local_size", 0>;
-
-def ROCDL_BlockDimYOp : ROCDL_DimGetterFunctionOp<"workgroup.dim.y",
-                                               "__ockl_get_local_size", 1>;
-
-def ROCDL_BlockDimZOp : ROCDL_DimGetterFunctionOp<"workgroup.dim.z",
-                                               "__ockl_get_local_size", 2>;
-
-def ROCDL_GridDimXOp : ROCDL_DimGetterFunctionOp<"grid.dim.x",
-                                               "__ockl_get_num_groups", 0>;
-
-def ROCDL_GridDimYOp : ROCDL_DimGetterFunctionOp<"grid.dim.y",
-                                               "__ockl_get_num_groups", 1>;
-
-def ROCDL_GridDimZOp : ROCDL_DimGetterFunctionOp<"grid.dim.z",
-                                               "__ockl_get_num_groups", 2>;
-
 //===----------------------------------------------------------------------===//
 // Synchronization primitives
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
index ce914c0ea3dd..31ac47ff3554 100644
--- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
@@ -18,6 +18,7 @@ add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms
   AttrToSPIRVConverter.cpp
   GPUToLLVMConversion.cpp
   GPUOpsLowering.cpp
+  IndexIntrinsicsOpLowering.cpp
 
   DEPENDS
   MLIRConversionPassIncGen
diff --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp
new file mode 100644
index 000000000000..ad3ae74d0c68
--- /dev/null
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp
@@ -0,0 +1,82 @@
+//===- IndexIntrinsicsOpLowering.cpp - GPU Index Op Lowering --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "IndexIntrinsicsOpLowering.h"
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/IR/BuiltinAttributes.h"
+
+using namespace mlir;
+using namespace mlir::gpu::index_lowering;
+
+LLVM::ConstantRangeAttr mlir::gpu::index_lowering::getIndexOpRange(
+    Operation *op, gpu::Dimension dim, std::optional<uint32_t> opUpperBound,
+    IndexKind indexKind, IntrType intrType, unsigned bitWidth) {
+  // Order of priority for bounds:
+  // 1. The upper_bound attribute
+  // 2. Inherent attributes on a surrounding gpu.func
+  // 3. Discardable attributes on a surrounding function of any kind
+  // The below code handles these in reverse order so that more important
+  // sources overwrite less important ones.
+  DenseI32ArrayAttr funcBounds = nullptr;
+  if (auto funcOp = op->getParentOfType<FunctionOpInterface>()) {
+    switch (indexKind) {
+    case IndexKind::Block: {
+      auto blockHelper =
+          gpu::GPUDialect::KnownBlockSizeAttrHelper(op->getContext());
+      if (blockHelper.isAttrPresent(funcOp))
+        funcBounds = blockHelper.getAttr(funcOp);
+      break;
+    }
+    case IndexKind::Grid: {
+      auto gridHelper =
+          gpu::GPUDialect::KnownGridSizeAttrHelper(op->getContext());
+      if (gridHelper.isAttrPresent(funcOp))
+        funcBounds = gridHelper.getAttr(funcOp);
+      break;
+    }
+    case IndexKind::Cluster: {
+      auto clusterHelper =
+          gpu::GPUDialect::KnownClusterSizeAttrHelper(op->getContext());
+      if (clusterHelper.isAttrPresent(funcOp))
+        funcBounds = clusterHelper.getAttr(funcOp);
+      break;
+    }
+    case IndexKind::Other:
+      break;
+    }
+  }
+  if (auto gpuFunc = op->getParentOfType<gpu::GPUFuncOp>()) {
+    switch (indexKind) {
+    case IndexKind::Block:
+      funcBounds = gpuFunc.getKnownBlockSizeAttr();
+      break;
+    case IndexKind::Grid:
+      funcBounds = gpuFunc.getKnownGridSizeAttr();
+      break;
+    case IndexKind::Cluster:
+      funcBounds = gpuFunc.getKnownClusterSizeAttr();
+      break;
+    case IndexKind::Other:
+      break;
+    }
+  }
+  std::optional<uint32_t> upperBound;
+  if (funcBounds)
+    upperBound = funcBounds.asArrayRef()[static_cast<uint32_t>(dim)];
+  if (opUpperBound)
+    upperBound = *opUpperBound;
+
+  if (!upperBound || intrType == IntrType::None)
+    return nullptr;
+
+  uint32_t min = (intrType == IntrType::Dim ? 1u : 0u);
+  uint32_t max =
+      llvm::SaturatingAdd(*upperBound, (intrType == IntrType::Id ? 0u : 1u));
+  return LLVM::ConstantRangeAttr::get(op->getContext(), bitWidth, min, max);
+}
diff --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
index ae0239132e7d..186823e1b40c 100644
--- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
@@ -1,4 +1,4 @@
-//===- IndexIntrinsicsOpLowering.h - GPU IndexOps Lowering class *- C++ -*-===//
+//===- IndexIntrinsicsOpLowering.h - GPU Index Op Lowering ------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -12,7 +12,6 @@
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/BuiltinAttributes.h"
-#include <limits>
 
 namespace mlir {
 namespace gpu {
@@ -24,6 +23,16 @@ enum class IntrType : uint32_t {
   Dim = 2,
 };
 
+/// Returns a ConstantRangeAttr for a GPU index op, or nullptr if no bounds
+/// are found. `bitWidth` controls the width of the returned range.
+/// Checks the provided upper_bound from the op (highest priority), inherent
+/// attrs on enclosing `gpu.func`s, and discardable attributes on other
+/// enclosing function ops (lowest priority).
+LLVM::ConstantRangeAttr getIndexOpRange(Operation *op, gpu::Dimension dim,
+                                        std::optional<uint32_t> opUpperBound,
+                                        IndexKind indexKind, IntrType intrType,
+                                        unsigned bitWidth);
+
 // Rewriting that replaces Op with XOp, YOp, or ZOp depending on the dimension
 // that Op operates on.  Op is assumed to return an `index` value and
 // XOp, YOp and ZOp are assumed to return an `llvm.i32` value.  Depending on
@@ -54,7 +63,7 @@ public:
   LogicalResult
   matchAndRewrite(Op op, typename Op::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    auto loc = op->getLoc();
+    Location loc = op->getLoc();
     MLIRContext *context = rewriter.getContext();
     Operation *newOp;
     switch (op.getDimension()) {
@@ -69,70 +78,13 @@ public:
       break;
     }
 
-    // Order of priority for bounds:
-    // 1. The upper_bound attribute
-    // 2. Inherent attributes on a surrounding gpu.func
-    // 3. Discardable attributes on a surrounding function of any kind
-    // The below code handles these in reverse order so that more important
-    // sources overwrite less important ones.
-    DenseI32ArrayAttr funcBounds = nullptr;
-    if (auto funcOp = op->template getParentOfType<FunctionOpInterface>()) {
-      switch (indexKind) {
-      case IndexKind::Block: {
-        auto blockHelper =
-            gpu::GPUDialect::KnownBlockSizeAttrHelper(op.getContext());
-        if (blockHelper.isAttrPresent(funcOp))
-          funcBounds = blockHelper.getAttr(funcOp);
-        break;
-      }
-      case IndexKind::Grid: {
-        auto gridHelper =
-            gpu::GPUDialect::KnownGridSizeAttrHelper(op.getContext());
-        if (gridHelper.isAttrPresent(funcOp))
-          funcBounds = gridHelper.getAttr(funcOp);
-        break;
-      }
-      case IndexKind::Cluster: {
-        auto clusterHelper =
-            gpu::GPUDialect::KnownClusterSizeAttrHelper(op.getContext());
-        if (clusterHelper.isAttrPresent(funcOp))
-          funcBounds = clusterHelper.getAttr(funcOp);
-        break;
-      }
-      case IndexKind::Other:
-        break;
-      }
-    }
-    if (auto gpuFunc = op->template getParentOfType<gpu::GPUFuncOp>()) {
-      switch (indexKind) {
-      case IndexKind::Block:
-        funcBounds = gpuFunc.getKnownBlockSizeAttr();
-        break;
-      case IndexKind::Grid:
-        funcBounds = gpuFunc.getKnownGridSizeAttr();
-        break;
-      case IndexKind::Cluster:
-        funcBounds = gpuFunc.getKnownClusterSizeAttr();
-        break;
-      case IndexKind::Other:
-        break;
-      }
-    }
-    std::optional<int32_t> upperBound;
-    if (funcBounds)
-      upperBound =
-          funcBounds.asArrayRef()[static_cast<uint32_t>(op.getDimension())];
-    if (auto opBound = op.getUpperBound())
-      upperBound = opBound->getZExtValue();
+    std::optional<uint32_t> opBound;
+    if (auto bound = op.getUpperBound())
+      opBound = static_cast<uint32_t>(bound->getZExtValue());
+    if (auto range = getIndexOpRange(op, op.getDimension(), opBound, indexKind,
+                                     intrType, /*bitWidth=*/32))
+      newOp->setAttr("range", range);
 
-    if (upperBound && intrType != IntrType::None) {
-      int32_t min = (intrType == IntrType::Dim ? 1 : 0);
-      int32_t max = *upperBound == std::numeric_limits<int32_t>::max()
-                        ? *upperBound
-                        : *upperBound + (intrType == IntrType::Id ? 0 : 1);
-      newOp->setAttr("range", LLVM::ConstantRangeAttr::get(
-                                  rewriter.getContext(), 32, min, max));
-    }
     if (indexBitwidth > 32) {
       newOp = LLVM::SExtOp::create(rewriter, loc,
                                    IntegerType::get(context, indexBitwidth),
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 65353fedc9c4..b1a4627977f8 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -102,6 +102,64 @@ static Value getLaneId(RewriterBase &rewriter, Location loc) {
   return laneId;
 }
 
+/// Maximum number of threads per block dimension on AMD GPUs.
+static constexpr int64_t kMaxThreadsPerBlockDim = 1024;
+
+/// Emits a call to an OCKL block/grid size function corresponding to
+/// `indexKind` with argument `dim`, querying for upper bounds in the context
+/// surrounding `contextOp` as a fallback for an unknown/unavailable
+/// `opUpperBound`.
+static Value getOcklDim(RewriterBase &rewriter,
+                        gpu::index_lowering::IndexKind indexKind,
+                        gpu::Dimension dim, Operation *contextOp,
+                        std::optional<uint32_t> opUpperBound) {
+  Location loc = contextOp->getLoc();
+  MLIRContext *context = contextOp->getContext();
+
+  auto i32Ty = IntegerType::get(context, 32);
+  auto i64Ty = IntegerType::get(context, 64);
+
+  int32_t dimParam = static_cast<int32_t>(dim);
+
+  StringRef functionName;
+  switch (indexKind) {
+  case gpu::index_lowering::IndexKind::Block:
+    functionName = "__ockl_get_local_size";
+    break;
+  case gpu::index_lowering::IndexKind::Grid:
+    functionName = "__ockl_get_num_groups";
+    break;
+  case gpu::index_lowering::IndexKind::Cluster:
+  case gpu::index_lowering::IndexKind::Other:
+    llvm_unreachable("Not valid index kinds for ockl lookup");
+  }
+
+  // Declare the ockl function: i64 @functionName(i32).
+  auto fnType = LLVM::LLVMFunctionType::get(i64Ty, {i32Ty});
+  Operation *moduleOp = contextOp->getParentWithTrait<OpTrait::SymbolTable>();
+  LLVM::LLVMFuncOp funcOp =
+      getOrDefineFunction(moduleOp, loc, rewriter, functionName, fnType);
+
+  // Create the call.
+  Value dimConst = LLVM::ConstantOp::create(rewriter, loc, i32Ty, dimParam);
+  auto callOp =
+      LLVM::CallOp::create(rewriter, loc, funcOp, ValueRange{dimConst});
+
+  // Set range attribute on the call result if bounds are available.
+  auto range = gpu::index_lowering::getIndexOpRange(
+      contextOp, dim, opUpperBound, indexKind,
+      gpu::index_lowering::IntrType::Dim, /*bitWidth=*/64);
+  // Fall back to the hardware limit for block dimensions.
+  if (!range && indexKind == gpu::index_lowering::IndexKind::Block)
+    range = LLVM::ConstantRangeAttr::get(context, APInt(64, 1),
+                                         APInt(64, kMaxThreadsPerBlockDim + 1));
+  if (range) {
+    callOp.setResAttrsAttr(rewriter.getArrayAttr(rewriter.getDictionaryAttr(
+        rewriter.getNamedAttr(LLVM::LLVMDialect::getRangeAttrName(), range))));
+  }
+  return callOp.getResult();
+}
+
 static constexpr StringLiteral amdgcnDataLayout =
     "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
     "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"
@@ -110,6 +168,36 @@ static constexpr StringLiteral amdgcnDataLayout =
     "64-S32-A5-G1-ni:7:8:9";
 
 namespace {
+
+/// Lowers gpu.block_dim / gpu.grid_dim to direct __ockl_get_local_size /
+/// __ockl_get_num_groups function calls.
+template <typename OpTy>
+struct GPUDimOpToOcklCall final : ConvertOpToLLVMPattern<OpTy> {
+  GPUDimOpToOcklCall(const LLVMTypeConverter &converter,
+                     gpu::index_lowering::IndexKind indexKind)
+      : ConvertOpToLLVMPattern<OpTy>(converter), indexKind(indexKind) {}
+
+  LogicalResult
+  matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+
+    std::optional<uint32_t> opUpperBound;
+    if (auto bound = op.getUpperBound())
+      opUpperBound = static_cast<uint32_t>(bound->getZExtValue());
+
+    Value ocklCall =
+        getOcklDim(rewriter, indexKind, op.getDimension(), op, opUpperBound);
+    Value result = truncOrExtToLLVMType(rewriter, loc, ocklCall,
+                                        *this->getTypeConverter());
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+
+private:
+  const gpu::index_lowering::IndexKind indexKind;
+};
+
 struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
   using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;
 
@@ -197,16 +285,37 @@ struct GPUSubgroupIdOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupIdOp> {
       // For older architectures, compute:
       // subgroup_id = linearized_thread_id / subgroup_size
       // where linearized_thread_id = tid.x + dim.x * (tid.y + dim.y * tid.z)
-      Value tidX = ROCDL::ThreadIdXOp::create(rewriter, loc, int32Type);
-      Value tidY = ROCDL::ThreadIdYOp::create(rewriter, loc, int32Type);
-      Value tidZ = ROCDL::ThreadIdZOp::create(rewriter, loc, int32Type);
-      Value dimX = ROCDL::BlockDimXOp::create(rewriter, loc, int32Type);
-      Value dimY = ROCDL::BlockDimYOp::create(rewriter, loc, int32Type);
+      auto tidX = ROCDL::ThreadIdXOp::create(rewriter, loc, int32Type);
+      auto tidY = ROCDL::ThreadIdYOp::create(rewriter, loc, int32Type);
+      auto tidZ = ROCDL::ThreadIdZOp::create(rewriter, loc, int32Type);
+      auto setBoundFromContext = [&](Operation *tidOp, gpu::Dimension dim) {
+        if (LLVM::ConstantRangeAttr range =
+                gpu::index_lowering::getIndexOpRange(
+                    op, dim, std::nullopt,
+                    gpu::index_lowering::IndexKind::Block,
+                    gpu::index_lowering::IntrType::Id, 32))
+          tidOp->setAttr("range", range);
+      };
+      setBoundFromContext(tidX, gpu::Dimension::x);
+      setBoundFromContext(tidY, gpu::Dimension::y);
+      setBoundFromContext(tidZ, gpu::Dimension::z);
+
+      auto flags =
+          LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
+
+      auto getBlockDim = [&](gpu::Dimension dim) {
+        Value dim64 =
+            getOcklDim(rewriter, gpu::index_lowering::IndexKind::Block, dim, op,
+                       std::nullopt);
+        Value dimTrunc =
+            LLVM::TruncOp::create(rewriter, loc, int32Type, dim64, flags);
+        return dimTrunc;
+      };
+      Value dimX = getBlockDim(gpu::Dimension::x);
+      Value dimY = getBlockDim(gpu::Dimension::y);
 
       // linearized = tid.x + dim.x * (tid.y + dim.y * tid.z)
       // Thread IDs and dimensions are non-negative and small, so use nuw+nsw.
-      auto flags =
-          LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
       Value dimYxTidZ =
           LLVM::MulOp::create(rewriter, loc, int32Type, dimY, tidZ, flags);
       Value tidYPlusDimYxTidZ =
@@ -626,13 +735,9 @@ void mlir::populateGpuToROCDLConversionPatterns(
   patterns.add<gpu::index_lowering::OpLowering<
       gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
       converter, IndexKind::Grid, IntrType::Id);
-  patterns.add<
-      gpu::index_lowering::OpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
-                                      ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
-      converter, IndexKind::Block, IntrType::Dim);
-  patterns.add<gpu::index_lowering::OpLowering<
-      gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
-      converter, IndexKind::Grid, IntrType::Dim);
+  patterns.add<GPUDimOpToOcklCall<gpu::BlockDimOp>>(converter,
+                                                    IndexKind::Block);
+  patterns.add<GPUDimOpToOcklCall<gpu::GridDimOp>>(converter, IndexKind::Grid);
   patterns.add<GPUReturnOpLowering>(converter);
   patterns.add<GPUFuncOpLowering>(
       converter,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
index 8142347d80cb..e1168e75f10d 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
@@ -17,7 +17,6 @@
 #include "mlir/IR/Operation.h"
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"
 
-#include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Support/raw_ostream.h"
@@ -26,32 +25,6 @@ using namespace mlir;
 using namespace mlir::LLVM;
 using mlir::LLVM::detail::createIntrinsicCall;
 
-// Create a call to ROCm-Device-Library function that returns an ID.
-// This is intended to specifically call device functions that fetch things like
-// block or grid dimensions, and so is limited to functions that take one
-// integer parameter.
-static llvm::Value *createDimGetterFunctionCall(llvm::IRBuilderBase &builder,
-                                                Operation *op, StringRef fnName,
-                                                int parameter) {
-  llvm::Module *module = builder.GetInsertBlock()->getModule();
-  llvm::FunctionType *functionType = llvm::FunctionType::get(
-      llvm::Type::getInt64Ty(module->getContext()), // return type.
-      llvm::Type::getInt32Ty(module->getContext()), // parameter type.
-      false);                                       // no variadic arguments.
-  llvm::Function *fn = dyn_cast<llvm::Function>(
-      module->getOrInsertFunction(fnName, functionType).getCallee());
-  llvm::Value *fnOp0 = llvm::ConstantInt::get(
-      llvm::Type::getInt32Ty(module->getContext()), parameter);
-  auto *call = builder.CreateCall(fn, ArrayRef<llvm::Value *>(fnOp0));
-  if (auto rangeAttr = op->getAttrOfType<LLVM::ConstantRangeAttr>("range")) {
-    // Zero-extend to 64 bits because the GPU dialect uses 32-bit bounds but
-    // these ockl functions are defined to be 64-bits
-    call->addRangeRetAttr(llvm::ConstantRange(rangeAttr.getLower().zext(64),
-                                              rangeAttr.getUpper().zext(64)));
-  }
-  return call;
-}
-
 namespace {
 /// Implementation of the dialect interface that converts operations belonging
 /// to the ROCDL dialect to LLVM IR.
diff --git a/mlir/test/Conversion/GPUCommon/lower-global-id.mlir b/mlir/test/Conversion/GPUCommon/lower-global-id.mlir
index b0274e0f9f29..94b9f9005276 100644
--- a/mlir/test/Conversion/GPUCommon/lower-global-id.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-global-id.mlir
@@ -11,9 +11,8 @@ gpu.module @kernel {
 // ROCDL-LABEL:   llvm.func @gpu_global_id() -> i64 {
 // ROCDL:           %[[WORKGROUP_0:.*]] = rocdl.workgroup.id.x : i32
 // ROCDL:           %[[SEXT_0:.*]] = llvm.sext %[[WORKGROUP_0]] : i32 to i64
-// ROCDL:           %[[WORKGROUP_1:.*]] = rocdl.workgroup.dim.x : i32
-// ROCDL:           %[[SEXT_1:.*]] = llvm.sext %[[WORKGROUP_1]] : i32 to i64
-// ROCDL:           %[[MUL_0:.*]] = llvm.mul %[[SEXT_0]], %[[SEXT_1]] : i64
+// ROCDL:           %[[DIM64:.*]] = llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
+// ROCDL:           %[[MUL_0:.*]] = llvm.mul %[[SEXT_0]], %[[DIM64]] : i64
 // ROCDL:           %[[WORKITEM_0:.*]] = rocdl.workitem.id.x : i32
 // ROCDL:           %[[SEXT_2:.*]] = llvm.sext %[[WORKITEM_0]] : i32 to i64
 // ROCDL:           %[[ADD_0:.*]] = llvm.add %[[SEXT_2]], %[[MUL_0]] : i64
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
index 030eb0e5eb18..b44216aab3bf 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
@@ -4,36 +4,66 @@
 gpu.module @test_module {
 // CHECK-LABEL: func @subgroup_id()
 func.func @subgroup_id() -> index {
-  // GFX12: rocdl.wave.id : i32
-  // GFX12: llvm.sext %{{.*}} : i32 to i64
+  // GFX12: %[[WAVEID:.+]] = rocdl.wave.id : i32
+  // GFX12: llvm.sext %[[WAVEID]] : i32 to i64
 
-  // GFX9-DAG: rocdl.workitem.id.x : i32
-  // GFX9-DAG: rocdl.workitem.id.y : i32
-  // GFX9-DAG: rocdl.workitem.id.z : i32
-  // GFX9-DAG: rocdl.workgroup.dim.x : i32
-  // GFX9-DAG: rocdl.workgroup.dim.y : i32
-  // GFX9-DAG: llvm.mul %{{.*}}, %{{.*}} overflow<nsw, nuw>
-  // GFX9-DAG: llvm.add %{{.*}}, %{{.*}} overflow<nsw, nuw>
-  // GFX9: rocdl.wavefrontsize : i32
-  // GFX9: llvm.udiv
-  // GFX9: llvm.sext %{{.*}} : i32 to i64
+  // GFX9-DAG: %[[IDX:.+]] = rocdl.workitem.id.x : i32
+  // GFX9-DAG: %[[IDY:.+]] = rocdl.workitem.id.y : i32
+  // GFX9-DAG: %[[IDZ:.+]] = rocdl.workitem.id.z : i32
+  // GFX9-DAG: %[[DIMX_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C0:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
+  // Yes, this is checking after the call that uses it. This prevents collisions with other 0s.
+  // GFX9-DAG: %[[C0]] = llvm.mlir.constant(0 : i32) : i32
+  // GFX9-DAG: %[[DIMX:.+]] = llvm.trunc %[[DIMX_I64]] overflow<nsw, nuw> : i64 to i32
+  // GFX9-DAG: %[[DIMY_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C1:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
+  // GFX9-DAG: %[[C1]] = llvm.mlir.constant(1 : i32) : i32
+  // GFX9-DAG: %[[DIMY:.+]] = llvm.trunc %[[DIMY_I64]] overflow<nsw, nuw> : i64 to i32
+  // GFX9: %[[Z_DY:.+]] = llvm.mul %[[DIMY]], %[[IDZ]] overflow<nsw, nuw>
+  // GFX9: %[[ZY:.+]] = llvm.add %[[IDY]], %[[Z_DY]] overflow<nsw, nuw>
+  // GFX9: %[[YZ_DX:.+]] = llvm.mul %[[DIMX]], %[[ZY]] overflow<nsw, nuw>
+  // GFX9: %[[ZYX:.+]] = llvm.add %[[IDX]], %[[YZ_DX]] overflow<nsw, nuw>
+  // GFX9: %[[WAVESZ:.+]] = rocdl.wavefrontsize : i32
+  // GFX9: %[[RES:.+]] = llvm.udiv %[[ZYX]], %[[WAVESZ]]
+  // GFX9: llvm.sext %[[RES]] : i32 to i64
   %subgroupId = gpu.subgroup_id : index
   func.return %subgroupId : index
 }
 
 // CHECK-LABEL: func @subgroup_id_with_upper_bound()
 func.func @subgroup_id_with_upper_bound() -> index {
-  // GFX12: rocdl.wave.id range <i32, 0, 4> : i32
-  // GFX12: llvm.sext %{{.*}} : i32 to i64
+  // GFX12: %[[WAVEID:.+]] = rocdl.wave.id range <i32, 0, 4> : i32
+  // GFX12: llvm.sext %[[WAVEID]] : i32 to i64
 
-  // GFX9-DAG: rocdl.workitem.id.x : i32
-  // GFX9-DAG: rocdl.workitem.id.y : i32
-  // GFX9-DAG: rocdl.workitem.id.z : i32
-  // GFX9-DAG: rocdl.workgroup.dim.x : i32
-  // GFX9-DAG: rocdl.workgroup.dim.y : i32
-  // GFX9: rocdl.wavefrontsize : i32
-  // GFX9: llvm.udiv
-  // GFX9: llvm.sext %{{.*}} : i32 to i64
+  // Minimal check to ensure we don't set any bounds based on the subgroup ID bound
+  // since we don't know which thread ID they go on to.
+  // GFX9: rocdl.workitem.id.x : i32
+  // GFX9-DAG: llvm.call @__ockl_get_local_size({{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
+  %subgroupId = gpu.subgroup_id upper_bound 4 : index
+  func.return %subgroupId : index
+}
+
+// CHECK-LABEL: func @subgroup_id_with_workgroup_sizes()
+func.func @subgroup_id_with_workgroup_sizes() -> index
+    attributes {gpu.known_block_size = array<i32: 64, 4, 1>} {
+  // GFX12: %[[WAVEID:.+]] = rocdl.wave.id range <i32, 0, 4> : i32
+  // GFX12: llvm.sext %[[WAVEID]] : i32 to i64
+
+  // GFX9-DAG: %[[IDX:.+]] = rocdl.workitem.id.x range <i32, 0, 64> : i32
+  // GFX9-DAG: %[[IDY:.+]] = rocdl.workitem.id.y range <i32, 0, 4> : i32
+  // GFX9-DAG: %[[IDZ:.+]] = rocdl.workitem.id.z range <i32, 0, 1> : i32
+  // GFX9-DAG: %[[DIMX_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C0:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 65>})
+  // Yes, this is checking after the call that uses it. This prevents collisions with other 0s.
+  // GFX9-DAG: %[[C0]] = llvm.mlir.constant(0 : i32) : i32
+  // GFX9-DAG: %[[DIMX:.+]] = llvm.trunc %[[DIMX_I64]] overflow<nsw, nuw> : i64 to i32
+  // GFX9-DAG: %[[DIMY_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C1:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 5>})
+  // GFX9-DAG: %[[C1]] = llvm.mlir.constant(1 : i32) : i32
+  // GFX9-DAG: %[[DIMY:.+]] = llvm.trunc %[[DIMY_I64]] overflow<nsw, nuw> : i64 to i32
+  // GFX9: %[[Z_DY:.+]] = llvm.mul %[[DIMY]], %[[IDZ]] overflow<nsw, nuw>
+  // GFX9: %[[ZY:.+]] = llvm.add %[[IDY]], %[[Z_DY]] overflow<nsw, nuw>
+  // GFX9: %[[YZ_DX:.+]] = llvm.mul %[[DIMX]], %[[ZY]] overflow<nsw, nuw>
+  // GFX9: %[[ZYX:.+]] = llvm.add %[[IDX]], %[[YZ_DX]] overflow<nsw, nuw>
+  // GFX9: %[[WAVESZ:.+]] = rocdl.wavefrontsize : i32
+  // GFX9: %[[RES:.+]] = llvm.udiv %[[ZYX]], %[[WAVESZ]]
+  // GFX9: llvm.sext %[[RES]] : i32 to i64
   %subgroupId = gpu.subgroup_id upper_bound 4 : index
   func.return %subgroupId : index
 }
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 9c5c6c7cf9c8..3cc9ded6fe91 100755
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -24,14 +24,14 @@ gpu.module @test_module {
     // CHECK: = llvm.sext %{{.*}} : i32 to i64
     %tIdZ = gpu.thread_id z
 
-    // CHECK: rocdl.workgroup.dim.x : i32
-    // CHECK: = llvm.sext %{{.*}} : i32 to i64
+    // CHECK-DAG: %[[BD_C0:.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK-DAG: %[[BD_C1:.*]] = llvm.mlir.constant(1 : i32) : i32
+    // CHECK-DAG: %[[BD_C2:.*]] = llvm.mlir.constant(2 : i32) : i32
+    // CHECK-DAG: llvm.call @__ockl_get_local_size(%[[BD_C0]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
     %bDimX = gpu.block_dim x
-    // CHECK: rocdl.workgroup.dim.y : i32
-    // CHECK: = llvm.sext %{{.*}} : i32 to i64
+    // CHECK-DAG: llvm.call @__ockl_get_local_size(%[[BD_C1]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
     %bDimY = gpu.block_dim y
-    // CHECK: rocdl.workgroup.dim.z : i32
-    // CHECK: = llvm.sext %{{.*}} : i32 to i64
+    // CHECK-DAG: llvm.call @__ockl_get_local_size(%[[BD_C2]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
     %bDimZ = gpu.block_dim z
 
     // CHECK: rocdl.workgroup.id.x : i32
@@ -44,14 +44,14 @@ gpu.module @test_module {
     // CHECK: = llvm.sext %{{.*}} : i32 to i64
     %bIdZ = gpu.block_id z
 
-    // CHECK: rocdl.grid.dim.x : i32
-    // CHECK: = llvm.sext %{{.*}} : i32 to i64
+    // CHECK-DAG: %[[GD_C0:.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK-DAG: %[[GD_C1:.*]] = llvm.mlir.constant(1 : i32) : i32
+    // CHECK-DAG: %[[GD_C2:.*]] = llvm.mlir.constant(2 : i32) : i32
+    // CHECK-DAG: llvm.call @__ockl_get_num_groups(%[[GD_C0]]) : (i32) -> i64
     %gDimX = gpu.grid_dim x
-    // CHECK: rocdl.grid.dim.y : i32
-    // CHECK: = llvm.sext %{{.*}} : i32 to i64
+    // CHECK-DAG: llvm.call @__ockl_get_num_groups(%[[GD_C1]]) : (i32) -> i64
     %gDimY = gpu.grid_dim y
-    // CHECK: rocdl.grid.dim.z : i32
-    // CHECK: = llvm.sext %{{.*}} : i32 to i64
+    // CHECK-DAG: llvm.call @__ockl_get_num_groups(%[[GD_C2]]) : (i32) -> i64
     %gDimZ = gpu.grid_dim z
 
     // CHECK: = rocdl.mbcnt.lo %{{.*}}, %{{.*}} {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range<i32, 0, 32>}]} : (i32, i32) -> i32
@@ -100,13 +100,33 @@ gpu.module @test_module {
     // CHECK: rocdl.workgroup.id.z range <i32, 0, 28> : i32
     %bIdZ = gpu.block_id z
 
+    // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 9>})
+    %bDimX = gpu.block_dim x
+    // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 13>})
+    %bDimY = gpu.block_dim y
+    // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 17>})
+    %bDimZ = gpu.block_dim z
+
+    // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 21>})
+    %gDimX = gpu.grid_dim x
+    // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 25>})
+    %gDimY = gpu.grid_dim y
+    // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 29>})
+    %gDimZ = gpu.grid_dim z
+
     // "Usage" to make the ID calls not die
     %0 = arith.addi %tIdX, %tIdY : index
     %1 = arith.addi %0, %tIdZ : index
     %2 = arith.addi %1, %bIdX : index
     %3 = arith.addi %2, %bIdY : index
     %4 = arith.addi %3, %bIdZ : index
-    %5 = arith.index_cast %4 : index to i32
+    %r0 = arith.addi %4, %bDimX : index
+    %r1 = arith.addi %r0, %bDimY : index
+    %r2 = arith.addi %r1, %bDimZ : index
+    %r3 = arith.addi %r2, %gDimX : index
+    %r4 = arith.addi %r3, %gDimY : index
+    %r5 = arith.addi %r4, %gDimZ : index
+    %5 = arith.index_cast %r5 : index to i32
     memref.store %5, %place[] : memref<i32>
     gpu.return
   }
@@ -796,7 +816,8 @@ gpu.module @test_module {
   func.func @gpu_dim_int_max_upper_bound()
       -> (index) {
 
-    // CHECK32: rocdl.workgroup.dim.x range <i32, 1, 2147483647> : i32
+    // CHECK32: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 2147483648>})
+    // CHECK32: llvm.trunc %{{.*}} : i64 to i32
     %bDimX = gpu.block_dim x upper_bound 2147483647
     func.return %bDimX : index
   }
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 1a810dce706b..1d835b352e51 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -26,20 +26,8 @@ func.func @rocdl_special_regs() -> i32 {
   %10 = rocdl.cluster.workgroup.id.y : i32
   // CHECK: rocdl.cluster.workgroup.id.z : i32
   %11 = rocdl.cluster.workgroup.id.z : i32
-  // CHECK: rocdl.workgroup.dim.x : i32
-  %12 = rocdl.workgroup.dim.x : i32
-  // CHECK: rocdl.workgroup.dim.y : i32
-  %13 = rocdl.workgroup.dim.y : i32
-  // CHECK: rocdl.workgroup.dim.z : i32
-  %14 = rocdl.workgroup.dim.z : i32
-  // CHECK: rocdl.grid.dim.x : i32
-  %15 = rocdl.grid.dim.x : i32
-  // CHECK: rocdl.grid.dim.y : i32
-  %16 = rocdl.grid.dim.y : i32
-  // CHECK: rocdl.grid.dim.z : i32
-  %17 = rocdl.grid.dim.z : i32
   // CHECK: rocdl.wave.id : i32
-  %18 = rocdl.wave.id : i32
+  %12 = rocdl.wave.id : i32
   llvm.return %0 : i32
 }
 
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 78a78c0bd1bb..d3ac38ecad32 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -27,37 +27,20 @@ llvm.func @rocdl_special_regs() -> i32 {
   // CHECK: call range(i32 0, 16) i32 @llvm.amdgcn.cluster.workgroup.id.z()
   %12 = rocdl.cluster.workgroup.id.z range <i32, 0, 16> : i32
 
-  // CHECK: call i64 @__ockl_get_local_size(i32 0)
-  %13 = rocdl.workgroup.dim.x : i64
-  // CHECK: call i64 @__ockl_get_local_size(i32 1)
-  %14 = rocdl.workgroup.dim.y : i64
-  // CHECK: call i64 @__ockl_get_local_size(i32 2)
-  %15 = rocdl.workgroup.dim.z : i64
-
-  // CHECK: call i64 @__ockl_get_num_groups(i32 0)
-  %16 = rocdl.grid.dim.x : i64
-  // CHECK: call i64 @__ockl_get_num_groups(i32 1)
-  %17 = rocdl.grid.dim.y : i64
-  // CHECK: call i64 @__ockl_get_num_groups(i32 2)
-  %18 = rocdl.grid.dim.z : i64
-
   // CHECK: call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
-  %19 = rocdl.workitem.id.x range <i32, 0, 64> : i32
-
-  // CHECK: call range(i64 1, 65) i64 @__ockl_get_local_size(i32 0)
-  %20 = rocdl.workgroup.dim.x range <i32, 1, 65> : i64
+  %13 = rocdl.workitem.id.x range <i32, 0, 64> : i32
 
   // CHECK: call i32 @llvm.amdgcn.wave.id()
-  %21 = rocdl.wave.id : i32
+  %14 = rocdl.wave.id : i32
 
   // CHECK: call range(i32 32, 65) i32 @llvm.amdgcn.wave.id()
-  %22 = rocdl.wave.id range <i32, 32, 65> : i32
+  %15 = rocdl.wave.id range <i32, 32, 65> : i32
 
   // CHECK: call i32 @llvm.amdgcn.wavefrontsize()
-  %23 = rocdl.wavefrontsize : i32
+  %16 = rocdl.wavefrontsize : i32
 
   // CHECK: call range(i32 32, 65) i32 @llvm.amdgcn.wavefrontsize()
-  %24 = rocdl.wavefrontsize range <i32, 32, 65> : i32
+  %17 = rocdl.wavefrontsize range <i32, 32, 65> : i32
 
   llvm.return %1 : i32
 }