[NVPTX] Convert vector function nvvm.annotations to attributes (#127736)

Replace some more nvvm.annotations with function attributes, auto-upgrading the annotations as needed. These new attributes will be more idiomatic and compile-time efficient than the annotations. - !"maxntid[xyz]" -> "nvvm.maxntid" - !"reqntid[xyz]" -> "nvvm.reqntid" - !"cluster_dim_[xyz]" -> "nvvm.cluster_dim"
2025-02-26 08:45:27 -08:00 · 2025-02-26 08:45:27 -08:00 · 6c2e170d04
commit 6c2e170d04
parent 14da7d5c1f
23 changed files with 303 additions and 314 deletions
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@ -357,17 +357,13 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
                                               int32_t *MaxThreadsVal,
                                               int32_t *MinBlocksVal,
                                               int32_t *MaxClusterRankVal) {
-  // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
  llvm::APSInt MaxThreads(32);
  MaxThreads = Attr->getMaxThreads()->EvaluateKnownConstInt(getContext());
  if (MaxThreads > 0) {
    if (MaxThreadsVal)
      *MaxThreadsVal = MaxThreads.getExtValue();
-    if (F) {
-      // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
-      NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
-                                              MaxThreads.getExtValue());
-    }
+    if (F)
+      F->addFnAttr("nvvm.maxntid", llvm::utostr(MaxThreads.getExtValue()));
  }

  // min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it
--- a/clang/test/CodeGenCUDA/launch-bounds.cu
+++ b/clang/test/CodeGenCUDA/launch-bounds.cu
@ -10,23 +10,30 @@
 #endif

 // CHECK: @Kernel1() #[[ATTR0:[0-9]+]]
+// CHECK: @Kernel2() #[[ATTR1:[0-9]+]]
+// CHECK: @{{.*}}Kernel3{{.*}}() #[[ATTR1]]
 // CHECK: @{{.*}}Kernel4{{.*}}() #[[ATTR0]]
-// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR1:[0-9]+]]
-// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR0]]
-// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR3:[0-9]+]]
+// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR2:[0-9]+]]
+// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR3:[0-9]+]]
+// CHECK: @{{.*}}Kernel7{{.*}}() #[[ATTR1]]
+// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR4:[0-9]+]]

-// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
-// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.minctasm"="258" {{.*}}}
-// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="12" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}

-// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR4:[0-9]+]]
-// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR4]]
-// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR5:[0-9]+]]
-// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR6:[0-9]+]]
+// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR0:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR0]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR1:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel7_sm_90{{.*}} #[[ATTR2:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR3:[0-9]+]]

-// CHECK_MAX_BLOCKS: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.minctasm"="2" {{.*}}}
-// CHECK_MAX_BLOCKS: attributes #[[ATTR5]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.minctasm"="258" {{.*}}}
-// CHECK_MAX_BLOCKS: attributes #[[ATTR6]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.minctasm"="12" {{.*}}}
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}

 // Test both max threads per block and Min cta per sm.
 extern "C" {
@ -37,8 +44,6 @@ Kernel1()
 }
 }

-// CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"maxntidx", i32 256}
-
 #ifdef USE_MAX_BLOCKS
 // Test max threads per block and min/max cta per sm.
 extern "C" {
@ -48,8 +53,6 @@ Kernel1_sm_90()
 {
 }
 }
-
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxntidx", i32 256}
 #endif // USE_MAX_BLOCKS

 // Test only max threads per block. Min cta per sm defaults to 0, and
@ -62,8 +65,6 @@ Kernel2()
 }
 }

-// CHECK: !{{[0-9]+}} = !{ptr @Kernel2, !"maxntidx", i32 256}
-
 template <int max_threads_per_block>
 __global__ void
 __launch_bounds__(max_threads_per_block)
@ -72,7 +73,6 @@ Kernel3()
 }

 template __global__ void Kernel3<MAX_THREADS_PER_BLOCK>();
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel3{{.*}}, !"maxntidx", i32 256}

 template <int max_threads_per_block, int min_blocks_per_mp>
 __global__ void
@ -82,7 +82,6 @@ Kernel4()
 }
 template __global__ void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();

-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256}

 #ifdef USE_MAX_BLOCKS
 template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
@ -93,7 +92,6 @@ Kernel4_sm_90()
 }
 template __global__ void Kernel4_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();

-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxntidx", i32 256}
 #endif //USE_MAX_BLOCKS

 const int constint = 100;
@ -106,8 +104,6 @@ Kernel5()
 }
 template __global__ void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();

-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356}
-
 #ifdef USE_MAX_BLOCKS

 template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
@ -120,7 +116,6 @@ Kernel5_sm_90()
 }
 template __global__ void Kernel5_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();

-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxntidx", i32 356}
 #endif //USE_MAX_BLOCKS

 // Make sure we don't emit negative launch bounds values.
@ -129,15 +124,12 @@ __launch_bounds__( -MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
 Kernel6()
 {
 }
-// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"maxntidx",

 __global__ void
 __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP )
 Kernel7()
 {
 }
-// CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"maxntidx",
-// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"minctasm",

 #ifdef USE_MAX_BLOCKS
 __global__ void
@ -145,17 +137,12 @@ __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP, -MAX_BLOCKS_PER_MP
 Kernel7_sm_90()
 {
 }
-// CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxntidx",
-// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"minctasm",
-// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxclusterrank",
 #endif // USE_MAX_BLOCKS

 const char constchar = 12;
 __global__ void __launch_bounds__(constint, constchar) Kernel8() {}
-// CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100

 #ifdef USE_MAX_BLOCKS
 const char constchar_2 = 14;
 __global__ void __launch_bounds__(constint, constchar, constchar_2) Kernel8_sm_90() {}
-// CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxntidx", i32 100
 #endif // USE_MAX_BLOCKS
--- a/clang/test/OpenMP/ompx_attributes_codegen.cpp
+++ b/clang/test/OpenMP/ompx_attributes_codegen.cpp
@ -11,9 +11,13 @@

 // Check that the target attributes are set on the generated kernel
 void func() {
-  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}}) #0
-  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}})
-  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #4
+  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #0
+  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}})
+  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #4
+
+  // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #[[ATTR0:[0-9]+]]
+  // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}}) #[[ATTR1:[0-9]+]]
+  // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #[[ATTR2:[0-9]+]]

  #pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]])
  {}
@ -34,9 +38,12 @@ void func() {
 // AMD-SAME: "omp_target_thread_limit"="17"

 // It is unclear if we should use the AMD annotations for other targets, we do for now.
-// NVIDIA: "omp_target_thread_limit"="20"
-// NVIDIA: "omp_target_thread_limit"="45"
-// NVIDIA: "omp_target_thread_limit"="17"
-// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l18, !"maxntidx", i32 20}
-// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l20, !"maxntidx", i32 45}
-// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l22, !"maxntidx", i32 17}
+// NVIDIA: attributes #[[ATTR0]]
+// NVIDIA-SAME: "nvvm.maxntid"="20"
+// NVIDIA-SAME: "omp_target_thread_limit"="20"
+// NVIDIA: attributes #[[ATTR1]]
+// NVIDIA-SAME: "nvvm.maxntid"="45"
+// NVIDIA-SAME: "omp_target_thread_limit"="45"
+// NVIDIA: attributes #[[ATTR2]]
+// NVIDIA-SAME: "nvvm.maxntid"="17"
+// NVIDIA-SAME: "omp_target_thread_limit"="17"
--- a/clang/test/OpenMP/thread_limit_nvptx.c
+++ b/clang/test/OpenMP/thread_limit_nvptx.c
@ -7,23 +7,21 @@
 #define HEADER

 void foo(int N) {
-// CHECK: l11, !"maxntidx", i32 128}
+// CHECK: define {{.*}}l11{{.*}} #[[ATTR0:[0-9]+]]
 #pragma omp target teams distribute parallel for simd
  for (int i = 0; i < N; ++i)
    ;
-// CHECK: l15, !"maxntidx", i32 4}
+// CHECK: define {{.*}}l15{{.*}} #[[ATTR1:[0-9]+]]
 #pragma omp target teams distribute parallel for simd thread_limit(4)
  for (int i = 0; i < N; ++i)
    ;
-// CHECK-NOT: l21, !"maxntidx", i32 128}
-// CHECK: l21, !"maxntidx", i32 42}
-// CHECK-NOT: l21, !"maxntidx", i32 128}
+
+// CHECK: define {{.*}}l20{{.*}} #[[ATTR2:[0-9]+]]
 #pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42))))
  for (int i = 0; i < N; ++i)
    ;
-// CHECK-NOT: l27, !"maxntidx", i32 42}
-// CHECK: l27, !"maxntidx", i32 22}
-// CHECK-NOT: l27, !"maxntidx", i32 42}
+
+// CHECK: define {{.*}}l25{{.*}} #[[ATTR3:[0-9]+]]
 #pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22)
  for (int i = 0; i < N; ++i)
    ;
@ -31,3 +29,7 @@ void foo(int N) {

 #endif

+// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="128" {{.*}}}
+// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="4" {{.*}}}
+// CHECK: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="42" {{.*}}}
+// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxntid"="22" {{.*}}}
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@ -74,6 +74,23 @@ Function Attributes
    This attribute indicates the maximum number of registers to be used for the
    kernel function.

+``"nvvm.maxntid"="<x>[,<y>[,<z>]]"``
+    This attribute declares the maximum number of threads in the thread block
+    (CTA). The maximum number of threads is the product of the maximum extent in
+    each dimension. Exceeding the maximum number of threads results in a runtime
+    error or kernel launch failure.
+
+``"nvvm.reqntid"="<x>[,<y>[,<z>]]"``
+    This attribute declares the exact number of threads in the thread block
+    (CTA). The number of threads is the product of the value in each dimension.
+    Specifying a different CTA dimension at launch will result in a runtime 
+    error or kernel launch failure.
+
+``"nvvm.cluster_dim"="<x>[,<y>[,<z>]]"``
+    This attribute declares the number of thread blocks (CTAs) in the cluster.
+    The total number of CTAs is the product of the number of CTAs in each 
+    dimension. Specifying a different cluster dimension at launch will result in
+    a runtime error or kernel launch failure. Only supported for Hopper+.

 .. _address_spaces:

--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@ -6406,45 +6406,13 @@ void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
  KernelEnvironmentGV->setInitializer(NewInitializer);
 }

-static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
-  Module &M = *Kernel.getParent();
-  NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
-  for (auto *Op : MD->operands()) {
-    if (Op->getNumOperands() != 3)
-      continue;
-    auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
-    if (!KernelOp || KernelOp->getValue() != &Kernel)
-      continue;
-    auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
-    if (!Prop || Prop->getString() != Name)
-      continue;
-    return Op;
-  }
-  return nullptr;
-}
-
-static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
-                                bool Min) {
-  // Update the "maxntidx" metadata for NVIDIA, or add it.
-  MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
-  if (ExistingOp) {
-    auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
-    int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
-    ExistingOp->replaceOperandWith(
-        2, ConstantAsMetadata::get(ConstantInt::get(
-               OldVal->getValue()->getType(),
-               Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
-  } else {
-    LLVMContext &Ctx = Kernel.getContext();
-    Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
-                          MDString::get(Ctx, Name),
-                          ConstantAsMetadata::get(
-                              ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
-    // Append metadata to nvvm.annotations
-    Module &M = *Kernel.getParent();
-    NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
-    MD->addOperand(MDNode::get(Ctx, MDVals));
+static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
+                            bool Min) {
+  if (Kernel.hasFnAttribute(Name)) {
+    int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
+    Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
  }
+  Kernel.addFnAttr(Name, llvm::utostr(Value));
 }

 std::pair<int32_t, int32_t>
@ -6466,9 +6434,8 @@ OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
    return {LB, UB};
  }

-  if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
-    auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
-    int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
+  if (Kernel.hasFnAttribute("nvvm.maxntid")) {
+    int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
    return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
  }
  return {0, ThreadLimit};
@ -6485,7 +6452,7 @@ void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
    return;
  }

-  updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
+  updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
 }

 std::pair<int32_t, int32_t>
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@ -13,11 +13,13 @@
 //===----------------------------------------------------------------------===//

 #include "llvm/IR/AutoUpgrade.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/IR/AttributeMask.h"
+#include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfo.h"
@ -46,6 +48,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Regex.h"
 #include "llvm/TargetParser/Triple.h"
+#include <cstdint>
 #include <cstring>
 #include <numeric>

@ -5021,6 +5024,43 @@ bool llvm::UpgradeDebugInfo(Module &M) {
  return Modified;
 }

+static void upgradeNVVMFnVectorAttr(const StringRef Attr, const char DimC,
+                                    GlobalValue *GV, const Metadata *V) {
+  Function *F = cast<Function>(GV);
+
+  constexpr StringLiteral DefaultValue = "1";
+  StringRef Vect3[3] = {DefaultValue, DefaultValue, DefaultValue};
+  unsigned Length = 0;
+
+  if (F->hasFnAttribute(Attr)) {
+    // We expect the existing attribute to have the form "x[,y[,z]]". Here we
+    // parse these elements placing them into Vect3
+    StringRef S = F->getFnAttribute(Attr).getValueAsString();
+    for (; Length < 3 && !S.empty(); Length++) {
+      auto [Part, Rest] = S.split(',');
+      Vect3[Length] = Part.trim();
+      S = Rest;
+    }
+  }
+
+  const unsigned Dim = DimC - 'x';
+  assert(Dim >= 0 && Dim < 3 && "Unexpected dim char");
+
+  const uint64_t VInt = mdconst::extract<ConstantInt>(V)->getZExtValue();
+
+  // local variable required for StringRef in Vect3 to point to.
+  const std::string VStr = llvm::utostr(VInt);
+  Vect3[Dim] = VStr;
+  Length = std::max(Length, Dim + 1);
+
+  const std::string NewAttr = llvm::join(ArrayRef(Vect3, Length), ",");
+  F->addFnAttr(Attr, NewAttr);
+}
+
+static inline bool isXYZ(StringRef S) {
+  return S == "x" || S == "y" || S == "z";
+}
+
 bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
                                        const Metadata *V) {
  if (K == "kernel") {
@ -5059,6 +5099,18 @@ bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
    cast<Function>(GV)->addFnAttr("nvvm.maxnreg", llvm::utostr(CV));
    return true;
  }
+  if (K.consume_front("maxntid") && isXYZ(K)) {
+    upgradeNVVMFnVectorAttr("nvvm.maxntid", K[0], GV, V);
+    return true;
+  }
+  if (K.consume_front("reqntid") && isXYZ(K)) {
+    upgradeNVVMFnVectorAttr("nvvm.reqntid", K[0], GV, V);
+    return true;
+  }
+  if (K.consume_front("cluster_dim_") && isXYZ(K)) {
+    upgradeNVVMFnVectorAttr("nvvm.cluster_dim", K[0], GV, V);
+    return true;
+  }

  return false;
 }
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@ -35,6 +35,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@ -427,24 +428,15 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
  // If the NVVM IR has some of reqntid* specified, then output
  // the reqntid directive, and set the unspecified ones to 1.
  // If none of Reqntid* is specified, don't output reqntid directive.
-  std::optional<unsigned> Reqntidx = getReqNTIDx(F);
-  std::optional<unsigned> Reqntidy = getReqNTIDy(F);
-  std::optional<unsigned> Reqntidz = getReqNTIDz(F);
+  const auto ReqNTID = getReqNTID(F);
+  if (!ReqNTID.empty())
+    O << formatv(".reqntid {0:$[, ]}\n",
+                 make_range(ReqNTID.begin(), ReqNTID.end()));

-  if (Reqntidx || Reqntidy || Reqntidz)
-    O << ".reqntid " << Reqntidx.value_or(1) << ", " << Reqntidy.value_or(1)
-      << ", " << Reqntidz.value_or(1) << "\n";
-
-  // If the NVVM IR has some of maxntid* specified, then output
-  // the maxntid directive, and set the unspecified ones to 1.
-  // If none of maxntid* is specified, don't output maxntid directive.
-  std::optional<unsigned> Maxntidx = getMaxNTIDx(F);
-  std::optional<unsigned> Maxntidy = getMaxNTIDy(F);
-  std::optional<unsigned> Maxntidz = getMaxNTIDz(F);
-
-  if (Maxntidx || Maxntidy || Maxntidz)
-    O << ".maxntid " << Maxntidx.value_or(1) << ", " << Maxntidy.value_or(1)
-      << ", " << Maxntidz.value_or(1) << "\n";
+  const auto MaxNTID = getMaxNTID(F);
+  if (!MaxNTID.empty())
+    O << formatv(".maxntid {0:$[, ]}\n",
+                 make_range(MaxNTID.begin(), MaxNTID.end()));

  if (const auto Mincta = getMinCTASm(F))
    O << ".minnctapersm " << *Mincta << "\n";
@ -458,21 +450,19 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
  const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());

  if (STI->getSmVersion() >= 90) {
-    std::optional<unsigned> ClusterX = getClusterDimx(F);
-    std::optional<unsigned> ClusterY = getClusterDimy(F);
-    std::optional<unsigned> ClusterZ = getClusterDimz(F);
+    const auto ClusterDim = getClusterDim(F);

-    if (ClusterX || ClusterY || ClusterZ) {
+    if (!ClusterDim.empty()) {
      O << ".explicitcluster\n";
-      if (ClusterX.value_or(1) != 0) {
-        assert(ClusterY.value_or(1) && ClusterZ.value_or(1) &&
+      if (ClusterDim[0] != 0) {
+        assert(llvm::all_of(ClusterDim, [](unsigned D) { return D != 0; }) &&
               "cluster_dim_x != 0 implies cluster_dim_y and cluster_dim_z "
               "should be non-zero as well");

-        O << ".reqnctapercluster " << ClusterX.value_or(1) << ", "
-          << ClusterY.value_or(1) << ", " << ClusterZ.value_or(1) << "\n";
+        O << formatv(".reqnctapercluster {0:$[, ]}\n",
+                     make_range(ClusterDim.begin(), ClusterDim.end()));
      } else {
-        assert(!ClusterY.value_or(1) && !ClusterZ.value_or(1) &&
+        assert(llvm::all_of(ClusterDim, [](unsigned D) { return D == 0; }) &&
               "cluster_dim_x == 0 implies cluster_dim_y and cluster_dim_z "
               "should be 0 as well");
      }
--- a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
@ -50,33 +50,10 @@ static std::string getHash(StringRef Str) {
  return llvm::utohexstr(Hash.low(), /*LowerCase=*/true);
 }

-static void addKernelMetadata(Module &M, Function *F) {
-  llvm::LLVMContext &Ctx = M.getContext();
-
-  // Get "nvvm.annotations" metadata node.
-  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
-
-  // This kernel is only to be called single-threaded.
-  llvm::Metadata *ThreadXMDVals[] = {
-      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidx"),
-      llvm::ConstantAsMetadata::get(
-          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
-  llvm::Metadata *ThreadYMDVals[] = {
-      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidy"),
-      llvm::ConstantAsMetadata::get(
-          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
-  llvm::Metadata *ThreadZMDVals[] = {
-      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidz"),
-      llvm::ConstantAsMetadata::get(
-          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
-
+static void addKernelAttrs(Function *F) {
  F->addFnAttr("nvvm.maxclusterrank", "1");
+  F->addFnAttr("nvvm.maxntid", "1");
  F->setCallingConv(CallingConv::PTX_Kernel);
-
-  // Append metadata to nvvm.annotations.
-  MD->addOperand(llvm::MDNode::get(Ctx, ThreadXMDVals));
-  MD->addOperand(llvm::MDNode::get(Ctx, ThreadYMDVals));
-  MD->addOperand(llvm::MDNode::get(Ctx, ThreadZMDVals));
 }

 static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
@ -88,7 +65,7 @@ static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
  Function *InitOrFiniKernel = Function::createWithDefaultAttr(
      FunctionType::get(Type::getVoidTy(M.getContext()), false),
      GlobalValue::WeakODRLinkage, 0, InitOrFiniKernelName, &M);
-  addKernelMetadata(M, InitOrFiniKernel);
+  addKernelAttrs(InitOrFiniKernel);

  return InitOrFiniKernel;
 }
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@ -575,13 +575,14 @@ unsigned NVPTXTTIImpl::getAssumedAddrSpace(const Value *V) const {
 void NVPTXTTIImpl::collectKernelLaunchBounds(
    const Function &F,
    SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
-  std::optional<unsigned> Val;
-  if ((Val = getMaxClusterRank(F)))
+  if (const auto Val = getMaxClusterRank(F))
    LB.push_back({"maxclusterrank", *Val});
-  if ((Val = getMaxNTIDx(F)))
-    LB.push_back({"maxntidx", *Val});
-  if ((Val = getMaxNTIDy(F)))
-    LB.push_back({"maxntidy", *Val});
-  if ((Val = getMaxNTIDz(F)))
-    LB.push_back({"maxntidz", *Val});
+
+  const auto MaxNTID = getMaxNTID(F);
+  if (MaxNTID.size() > 0)
+    LB.push_back({"maxntidx", MaxNTID[0]});
+  if (MaxNTID.size() > 1)
+    LB.push_back({"maxntidy", MaxNTID[1]});
+  if (MaxNTID.size() > 2)
+    LB.push_back({"maxntidz", MaxNTID[2]});
 }
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@ -13,6 +13,8 @@
 #include "NVPTXUtilities.h"
 #include "NVPTX.h"
 #include "NVPTXTargetMachine.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
@ -20,6 +22,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Alignment.h"
 #include "llvm/Support/Mutex.h"
+#include <cstdint>
 #include <cstring>
 #include <map>
 #include <mutex>
@ -196,6 +199,35 @@ static std::optional<unsigned> getFnAttrParsedInt(const Function &F,
             : std::nullopt;
 }

+static SmallVector<unsigned, 3> getFnAttrParsedVector(const Function &F,
+                                                      StringRef Attr) {
+  SmallVector<unsigned, 3> V;
+  auto &Ctx = F.getContext();
+
+  if (F.hasFnAttribute(Attr)) {
+    // We expect the attribute value to be of the form "x[,y[,z]]", where x, y,
+    // and z are unsigned values.
+    StringRef S = F.getFnAttribute(Attr).getValueAsString();
+    for (unsigned I = 0; I < 3 && !S.empty(); I++) {
+      auto [First, Rest] = S.split(",");
+      unsigned IntVal;
+      if (First.trim().getAsInteger(0, IntVal))
+        Ctx.emitError("can't parse integer attribute " + First + " in " + Attr);
+
+      V.push_back(IntVal);
+      S = Rest;
+    }
+  }
+  return V;
+}
+
+static std::optional<uint64_t> getVectorProduct(ArrayRef<unsigned> V) {
+  if (V.empty())
+    return std::nullopt;
+
+  return std::accumulate(V.begin(), V.end(), 1, std::multiplies<uint64_t>{});
+}
+
 bool isParamGridConstant(const Value &V) {
  if (const Argument *Arg = dyn_cast<Argument>(&V)) {
    // "grid_constant" counts argument indices starting from 1
@ -254,71 +286,39 @@ StringRef getSamplerName(const Value &V) {
  return V.getName();
 }

-std::optional<unsigned> getMaxNTIDx(const Function &F) {
-  return findOneNVVMAnnotation(&F, "maxntidx");
+SmallVector<unsigned, 3> getMaxNTID(const Function &F) {
+  return getFnAttrParsedVector(F, "nvvm.maxntid");
 }

-std::optional<unsigned> getMaxNTIDy(const Function &F) {
-  return findOneNVVMAnnotation(&F, "maxntidy");
+SmallVector<unsigned, 3> getReqNTID(const Function &F) {
+  return getFnAttrParsedVector(F, "nvvm.reqntid");
 }

-std::optional<unsigned> getMaxNTIDz(const Function &F) {
-  return findOneNVVMAnnotation(&F, "maxntidz");
+SmallVector<unsigned, 3> getClusterDim(const Function &F) {
+  return getFnAttrParsedVector(F, "nvvm.cluster_dim");
 }

-std::optional<unsigned> getMaxNTID(const Function &F) {
+std::optional<uint64_t> getOverallMaxNTID(const Function &F) {
  // Note: The semantics here are a bit strange. The PTX ISA states the
  // following (11.4.2. Performance-Tuning Directives: .maxntid):
  //
  //  Note that this directive guarantees that the total number of threads does
  //  not exceed the maximum, but does not guarantee that the limit in any
  //  particular dimension is not exceeded.
-  std::optional<unsigned> MaxNTIDx = getMaxNTIDx(F);
-  std::optional<unsigned> MaxNTIDy = getMaxNTIDy(F);
-  std::optional<unsigned> MaxNTIDz = getMaxNTIDz(F);
-  if (MaxNTIDx || MaxNTIDy || MaxNTIDz)
-    return MaxNTIDx.value_or(1) * MaxNTIDy.value_or(1) * MaxNTIDz.value_or(1);
-  return std::nullopt;
+  const auto MaxNTID = getMaxNTID(F);
+  return getVectorProduct(MaxNTID);
 }

-std::optional<unsigned> getClusterDimx(const Function &F) {
-  return findOneNVVMAnnotation(&F, "cluster_dim_x");
-}
-
-std::optional<unsigned> getClusterDimy(const Function &F) {
-  return findOneNVVMAnnotation(&F, "cluster_dim_y");
-}
-
-std::optional<unsigned> getClusterDimz(const Function &F) {
-  return findOneNVVMAnnotation(&F, "cluster_dim_z");
+std::optional<uint64_t> getOverallReqNTID(const Function &F) {
+  // Note: The semantics here are a bit strange. See getMaxNTID.
+  const auto ReqNTID = getReqNTID(F);
+  return getVectorProduct(ReqNTID);
 }

 std::optional<unsigned> getMaxClusterRank(const Function &F) {
  return getFnAttrParsedInt(F, "nvvm.maxclusterrank");
 }

-std::optional<unsigned> getReqNTIDx(const Function &F) {
-  return findOneNVVMAnnotation(&F, "reqntidx");
-}
-
-std::optional<unsigned> getReqNTIDy(const Function &F) {
-  return findOneNVVMAnnotation(&F, "reqntidy");
-}
-
-std::optional<unsigned> getReqNTIDz(const Function &F) {
-  return findOneNVVMAnnotation(&F, "reqntidz");
-}
-
-std::optional<unsigned> getReqNTID(const Function &F) {
-  // Note: The semantics here are a bit strange. See getMaxNTID.
-  std::optional<unsigned> ReqNTIDx = getReqNTIDx(F);
-  std::optional<unsigned> ReqNTIDy = getReqNTIDy(F);
-  std::optional<unsigned> ReqNTIDz = getReqNTIDz(F);
-  if (ReqNTIDx || ReqNTIDy || ReqNTIDz)
-    return ReqNTIDx.value_or(1) * ReqNTIDy.value_or(1) * ReqNTIDz.value_or(1);
-  return std::nullopt;
-}
-
 std::optional<unsigned> getMinCTASm(const Function &F) {
  return getFnAttrParsedInt(F, "nvvm.minctasm");
 }
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@ -14,6 +14,7 @@
 #define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H

 #include "NVPTX.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/CallingConv.h"
@ -47,19 +48,12 @@ StringRef getTextureName(const Value &);
 StringRef getSurfaceName(const Value &);
 StringRef getSamplerName(const Value &);

-std::optional<unsigned> getMaxNTIDx(const Function &);
-std::optional<unsigned> getMaxNTIDy(const Function &);
-std::optional<unsigned> getMaxNTIDz(const Function &);
-std::optional<unsigned> getMaxNTID(const Function &);
+SmallVector<unsigned, 3> getMaxNTID(const Function &);
+SmallVector<unsigned, 3> getReqNTID(const Function &);
+SmallVector<unsigned, 3> getClusterDim(const Function &);

-std::optional<unsigned> getReqNTIDx(const Function &);
-std::optional<unsigned> getReqNTIDy(const Function &);
-std::optional<unsigned> getReqNTIDz(const Function &);
-std::optional<unsigned> getReqNTID(const Function &);
-
-std::optional<unsigned> getClusterDimx(const Function &);
-std::optional<unsigned> getClusterDimy(const Function &);
-std::optional<unsigned> getClusterDimz(const Function &);
+std::optional<uint64_t> getOverallMaxNTID(const Function &);
+std::optional<uint64_t> getOverallReqNTID(const Function &);

 std::optional<unsigned> getMaxClusterRank(const Function &);
 std::optional<unsigned> getMinCTASm(const Function &);
--- a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
@ -67,8 +67,8 @@ static bool runNVVMIntrRange(Function &F) {
    unsigned x, y, z;
  } MaxBlockSize, MaxGridSize;

-  const unsigned MetadataNTID = getReqNTID(F).value_or(
-      getMaxNTID(F).value_or(std::numeric_limits<unsigned>::max()));
+  const unsigned MetadataNTID = getOverallReqNTID(F).value_or(
+      getOverallMaxNTID(F).value_or(std::numeric_limits<unsigned>::max()));

  MaxBlockSize.x = std::min(1024u, MetadataNTID);
  MaxBlockSize.y = std::min(1024u, MetadataNTID);
--- a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll
+++ b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll
@ -24,11 +24,11 @@ attributes #0 = {
  "omp_target_num_teams"="100"
  "omp_target_thread_limit"="101"
  "nvvm.maxclusterrank"="200"
+  "nvvm.maxntid"="210,211,212"
 }

 !llvm.module.flags = !{!0}
 !llvm.dbg.cu = !{!1}
-!nvvm.annotations = !{!7, !8, !9, !10}

 !0 = !{i32 2, !"Debug Info Version", i32 3}
 !1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
@ -36,7 +36,3 @@ attributes #0 = {
 !3 = !{}
 !4 = !DISubroutineType(types: !3)
 !5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
-!7 = !{ptr @test, !"maxntidx", i32 210}
-!8 = !{ptr @test, !"maxntidy", i32 211}
-!9 = !{ptr @test, !"maxntidz", i32 212}
-!10 = distinct !{ptr null, !"kernel", i32 1}
--- a/llvm/test/CodeGen/NVPTX/annotations.ll
+++ b/llvm/test/CodeGen/NVPTX/annotations.ll
@ -9,14 +9,14 @@
 ; CHECK: .global .surfref surface

 ; CHECK: .entry kernel_func_maxntid
-define void @kernel_func_maxntid(ptr %a) {
+define ptx_kernel void @kernel_func_maxntid(ptr %a) "nvvm.maxntid"="10,20,30" {
 ; CHECK: .maxntid 10, 20, 30
 ; CHECK: ret
  ret void
 }

 ; CHECK: .entry kernel_func_reqntid
-define void @kernel_func_reqntid(ptr %a) {
+define ptx_kernel void @kernel_func_reqntid(ptr %a) "nvvm.reqntid"="11,22,33" {
 ; CHECK: .reqntid 11, 22, 33
 ; CHECK: ret
  ret void
@ -36,13 +36,7 @@ define ptx_kernel void @kernel_func_maxnreg() "nvvm.maxnreg"="1234" {
  ret void
 }

-!nvvm.annotations = !{!1, !2, !3, !4, !9, !10}
-
-!1 = !{ptr @kernel_func_maxntid, !"kernel", i32 1}
-!2 = !{ptr @kernel_func_maxntid, !"maxntidx", i32 10, !"maxntidy", i32 20, !"maxntidz", i32 30}
-
-!3 = !{ptr @kernel_func_reqntid, !"kernel", i32 1}
-!4 = !{ptr @kernel_func_reqntid, !"reqntidx", i32 11, !"reqntidy", i32 22, !"reqntidz", i32 33}
+!nvvm.annotations = !{!9, !10}

 !9 = !{ptr addrspace(1) @texture, !"texture", i32 1}
 !10 = !{ptr addrspace(1) @surface, !"surface", i32 1}
--- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
@ -26,8 +26,4 @@ bb:
  ret void
 }

-attributes #0 = { norecurse nounwind "polly.skip.fn" }
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @spam, !"maxntidx", i64 1, !"maxntidy", i64 1, !"maxntidz", i64 1}
+attributes #0 = { norecurse nounwind "polly.skip.fn" "nvvm.maxntid"="1,1,1" }
--- a/llvm/test/CodeGen/NVPTX/cluster-dim.ll
+++ b/llvm/test/CodeGen/NVPTX/cluster-dim.ll
@ -3,7 +3,7 @@
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 | FileCheck -check-prefixes=CHECK90 %s
 ; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %}

-define ptx_kernel void @kernel_func_clusterxyz() {
+define ptx_kernel void @kernel_func_clusterxyz() "nvvm.cluster_dim"="3,5,7" {
 ; CHECK80-LABEL: kernel_func_clusterxyz(
 ; CHECK80:       {
 ; CHECK80-EMPTY:
@ -21,8 +21,3 @@ define ptx_kernel void @kernel_func_clusterxyz() {
 ; CHECK90-NEXT:    ret;
  ret void
 }
-
-
-!nvvm.annotations = !{!1}
-
-!1 = !{ptr @kernel_func_clusterxyz, !"cluster_dim_x", i32 3, !"cluster_dim_y", i32 5, !"cluster_dim_z", i32 7}
--- a/llvm/test/CodeGen/NVPTX/intr-range.ll
+++ b/llvm/test/CodeGen/NVPTX/intr-range.ll
@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5
 ; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -mcpu=sm_20 -passes=nvvm-intr-range | FileCheck %s

-define ptx_kernel i32 @test_maxntid() {
+define ptx_kernel i32 @test_maxntid() "nvvm.maxntid"="32,1,3" {
 ; CHECK-LABEL: define ptx_kernel i32 @test_maxntid(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
@ -31,9 +31,9 @@ define ptx_kernel i32 @test_maxntid() {
  ret i32 %11
 }

-define ptx_kernel i32 @test_reqntid() {
+define ptx_kernel i32 @test_reqntid() "nvvm.reqntid"="20" {
 ; CHECK-LABEL: define ptx_kernel i32 @test_reqntid(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ; CHECK-NEXT:    [[TMP5:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
 ; CHECK-NEXT:    [[TMP2:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
@ -64,9 +64,9 @@ define ptx_kernel i32 @test_reqntid() {
 ;; A case like this could occur if a function with the sreg intrinsic was
 ;; inlined into a kernel where the tid metadata is present, ensure the range is
 ;; updated.
-define ptx_kernel i32 @test_inlined() {
+define ptx_kernel i32 @test_inlined() "nvvm.maxntid"="4" {
 ; CHECK-LABEL: define ptx_kernel i32 @test_inlined(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 4) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
@ -81,8 +81,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
-
-!nvvm.annotations = !{!0, !1, !2}
-!0 = !{ptr @test_maxntid, !"maxntidx", i32 32, !"maxntidz", i32 3}
-!1 = !{ptr @test_reqntid, !"reqntidx", i32 20}
-!2 = !{ptr @test_inlined, !"maxntidx", i32 4}
--- a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
@ -84,11 +84,4 @@ define internal void @bar() {
 ; CHECK:       while.end:
 ; CHECK-NEXT:    ret void

-; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="1" }
-
-; CHECK: [[META1:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidx", i32 1}
-; CHECK: [[META2:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidy", i32 1}
-; CHECK: [[META3:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidz", i32 1}
-; CHECK: [[META4:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidx", i32 1}
-; CHECK: [[META5:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidy", i32 1}
-; CHECK: [[META6:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidz", i32 1}
+; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="1" "nvvm.maxntid"="1" }
--- a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
+++ b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
@ -4,20 +4,16 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx64-unknown-unknown"

-; CHECK: .maxntid 128, 1, 1
+; CHECK: .maxntid 128
 ; CHECK: .minnctapersm 2
 ; CHECK_SM_90: .maxclusterrank 8
 ; CHECK_SM_80-NOT: .maxclusterrank 8

 ; Make sure that for SM version prior to 90 `.maxclusterrank` directive is
 ; silently ignored.
-define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() "nvvm.minctasm"="2" "nvvm.maxclusterrank"="8" {
+define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() "nvvm.minctasm"="2" "nvvm.maxclusterrank"="8" "nvvm.maxntid"="128" {
 entry:
  %a = alloca i32, align 4
  store volatile i32 1, ptr %a, align 4
  ret void
 }
-
-!nvvm.annotations = !{!1}
-
-!1 = !{ptr @_Z18TestMaxClusterRankv, !"maxntidx", i32 128}
--- a/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll
+++ b/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll
@ -48,7 +48,55 @@ define void @test_maxnreg() {
  ret void
 }

-!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6}
+define void @test_maxntid_1() {
+; CHECK-LABEL: define void @test_maxntid_1(
+; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @test_maxntid_2() {
+; CHECK-LABEL: define void @test_maxntid_2(
+; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @test_maxntid_3() {
+; CHECK-LABEL: define void @test_maxntid_3(
+; CHECK-SAME: ) #[[ATTR6:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @test_maxntid_4() {
+; CHECK-LABEL: define void @test_maxntid_4(
+; CHECK-SAME: ) #[[ATTR7:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @test_reqntid() {
+; CHECK-LABEL: define void @test_reqntid(
+; CHECK-SAME: ) #[[ATTR8:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @test_cluster_dim() {
+; CHECK-LABEL: define void @test_cluster_dim(
+; CHECK-SAME: ) #[[ATTR9:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12}

 !0 = !{ptr @test_align, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020010}
 !1 = !{null, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020008}
@ -57,12 +105,24 @@ define void @test_maxnreg() {
 !4 = !{ptr @test_cluster_max_blocks, !"cluster_max_blocks", i32 3}
 !5 = !{ptr @test_minctasm, !"minctasm", i32 4}
 !6 = !{ptr @test_maxnreg, !"maxnreg", i32 5}
+!7 = !{ptr @test_maxntid_1, !"maxntidx", i32 50}
+!8 = !{ptr @test_maxntid_2, !"maxntidx", i32 11, !"maxntidy", i32 22, !"maxntidz", i32 33}
+!9 = !{ptr @test_maxntid_3, !"maxntidz", i32 11, !"maxntidy", i32 22, !"maxntidx", i32 33}
+!10 = !{ptr @test_maxntid_4, !"maxntidz", i32 100}
+!11 = !{ptr @test_reqntid, !"reqntidx", i32 31, !"reqntidy", i32 32, !"reqntidz", i32 33}
+!12 = !{ptr @test_cluster_dim, !"cluster_dim_x", i32 101, !"cluster_dim_y", i32 102, !"cluster_dim_z", i32 103}

 ;.
 ; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="2" }
 ; CHECK: attributes #[[ATTR1]] = { "nvvm.maxclusterrank"="3" }
 ; CHECK: attributes #[[ATTR2]] = { "nvvm.minctasm"="4" }
 ; CHECK: attributes #[[ATTR3]] = { "nvvm.maxnreg"="5" }
+; CHECK: attributes #[[ATTR4]] = { "nvvm.maxntid"="50" }
+; CHECK: attributes #[[ATTR5]] = { "nvvm.maxntid"="11,22,33" }
+; CHECK: attributes #[[ATTR6]] = { "nvvm.maxntid"="33,22,11" }
+; CHECK: attributes #[[ATTR7]] = { "nvvm.maxntid"="1,1,100" }
+; CHECK: attributes #[[ATTR8]] = { "nvvm.reqntid"="31,32,33" }
+; CHECK: attributes #[[ATTR9]] = { "nvvm.cluster_dim"="101,102,103" }
 ;.
 ; CHECK: [[META0:![0-9]+]] = !{ptr @test_align, !"align", i32 8}
 ;.
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@ -18,8 +18,10 @@
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"

 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/Support/FormatVariadic.h"

 using namespace mlir;
 using namespace mlir::LLVM;
@ -195,48 +197,33 @@ public:
    auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
    if (!func)
      return failure();
-    llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
    llvm::Function *llvmFunc = moduleTranslation.lookupFunction(func.getName());

-    auto generateMetadata = [&](int dim, StringRef name) {
-      llvm::Metadata *llvmMetadata[] = {
-          llvm::ValueAsMetadata::get(llvmFunc),
-          llvm::MDString::get(llvmContext, name),
-          llvm::ValueAsMetadata::get(llvm::ConstantInt::get(
-              llvm::Type::getInt32Ty(llvmContext), dim))};
-      llvm::MDNode *llvmMetadataNode =
-          llvm::MDNode::get(llvmContext, llvmMetadata);
-      moduleTranslation.getOrInsertNamedModuleMetadata("nvvm.annotations")
-          ->addOperand(llvmMetadataNode);
-    };
    if (attribute.getName() == NVVM::NVVMDialect::getMaxntidAttrName()) {
      if (!dyn_cast<DenseI32ArrayAttr>(attribute.getValue()))
        return failure();
      auto values = cast<DenseI32ArrayAttr>(attribute.getValue());
-      generateMetadata(values[0], NVVM::NVVMDialect::getMaxntidXName());
-      if (values.size() > 1)
-        generateMetadata(values[1], NVVM::NVVMDialect::getMaxntidYName());
-      if (values.size() > 2)
-        generateMetadata(values[2], NVVM::NVVMDialect::getMaxntidZName());
+      const std::string attr = llvm::formatv(
+          "{0:$[,]}", llvm::make_range(values.asArrayRef().begin(),
+                                       values.asArrayRef().end()));
+      llvmFunc->addFnAttr("nvvm.maxntid", attr);
    } else if (attribute.getName() == NVVM::NVVMDialect::getReqntidAttrName()) {
      if (!dyn_cast<DenseI32ArrayAttr>(attribute.getValue()))
        return failure();
      auto values = cast<DenseI32ArrayAttr>(attribute.getValue());
-      generateMetadata(values[0], NVVM::NVVMDialect::getReqntidXName());
-      if (values.size() > 1)
-        generateMetadata(values[1], NVVM::NVVMDialect::getReqntidYName());
-      if (values.size() > 2)
-        generateMetadata(values[2], NVVM::NVVMDialect::getReqntidZName());
+      const std::string attr = llvm::formatv(
+          "{0:$[,]}", llvm::make_range(values.asArrayRef().begin(),
+                                       values.asArrayRef().end()));
+      llvmFunc->addFnAttr("nvvm.reqntid", attr);
    } else if (attribute.getName() ==
               NVVM::NVVMDialect::getClusterDimAttrName()) {
      if (!dyn_cast<DenseI32ArrayAttr>(attribute.getValue()))
        return failure();
      auto values = cast<DenseI32ArrayAttr>(attribute.getValue());
-      generateMetadata(values[0], NVVM::NVVMDialect::getClusterDimXName());
-      if (values.size() > 1)
-        generateMetadata(values[1], NVVM::NVVMDialect::getClusterDimYName());
-      if (values.size() > 2)
-        generateMetadata(values[2], NVVM::NVVMDialect::getClusterDimZName());
+      const std::string attr = llvm::formatv(
+          "{0:$[,]}", llvm::make_range(values.asArrayRef().begin(),
+                                       values.asArrayRef().end()));
+      llvmFunc->addFnAttr("nvvm.cluster_dim", attr);
    } else if (attribute.getName() ==
               NVVM::NVVMDialect::getClusterMaxBlocksAttrName()) {
      auto value = dyn_cast<IntegerAttr>(attribute.getValue());
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@ -590,33 +590,24 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 2
  llvm.return
 }

-// CHECK: define ptx_kernel void @kernel_func
-// CHECK:     !nvvm.annotations =
-// CHECK:     {ptr @kernel_func, !"maxntidx", i32 1}
-// CHECK:     {ptr @kernel_func, !"maxntidy", i32 23}
-// CHECK:     {ptr @kernel_func, !"maxntidz", i32 32}
+// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]]
+// CHECK: attributes #[[ATTR0]] = { "nvvm.maxntid"="1,23,32" }
 // -----

 llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.reqntid = array<i32: 1, 23, 32>} {
  llvm.return
 }

-// CHECK: define ptx_kernel void @kernel_func
-// CHECK:     !nvvm.annotations =
-// CHECK:     {ptr @kernel_func, !"reqntidx", i32 1}
-// CHECK:     {ptr @kernel_func, !"reqntidy", i32 23}
-// CHECK:     {ptr @kernel_func, !"reqntidz", i32 32}
+// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]]
+// CHECK: attributes #[[ATTR0]] = { "nvvm.reqntid"="1,23,32" }
 // -----

 llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_dim = array<i32: 3, 5, 7>} {
  llvm.return
 }

-// CHECK: define ptx_kernel void @kernel_func
-// CHECK:     !nvvm.annotations =
-// CHECK:     {ptr @kernel_func, !"cluster_dim_x", i32 3}
-// CHECK:     {ptr @kernel_func, !"cluster_dim_y", i32 5}
-// CHECK:     {ptr @kernel_func, !"cluster_dim_z", i32 7}
+// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]]
+// CHECK: attributes #[[ATTR0]] = { "nvvm.cluster_dim"="3,5,7" }
 // -----

 llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_max_blocks = 8} {
@ -650,11 +641,7 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 2
 }

 // CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]]
-// CHECK: attributes #[[ATTR0]] = { "nvvm.maxnreg"="32" "nvvm.minctasm"="16" }
-// CHECK:     !nvvm.annotations =
-// CHECK:     {ptr @kernel_func, !"maxntidx", i32 1}
-// CHECK:     {ptr @kernel_func, !"maxntidy", i32 23}
-// CHECK:     {ptr @kernel_func, !"maxntidz", i32 32}
+// CHECK: attributes #[[ATTR0]] = { "nvvm.maxnreg"="32" "nvvm.maxntid"="1,23,32" "nvvm.minctasm"="16" }

 // -----
 // CHECK: define ptx_kernel void @kernel_func