[NVPTX] Convert vector function nvvm.annotations to attributes (#127736)

Replace some more nvvm.annotations with function attributes,
auto-upgrading the annotations as needed. These new attributes will be
more idiomatic and compile-time efficient than the annotations.

- !"maxntid[xyz]" -> "nvvm.maxntid"
- !"reqntid[xyz]" -> "nvvm.reqntid"
- !"cluster_dim_[xyz]" -> "nvvm.cluster_dim"
This commit is contained in:
Alex MacLean 2025-02-26 08:45:27 -08:00 committed by GitHub
parent 14da7d5c1f
commit 6c2e170d04
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
23 changed files with 303 additions and 314 deletions

View File

@ -357,17 +357,13 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
int32_t *MaxThreadsVal,
int32_t *MinBlocksVal,
int32_t *MaxClusterRankVal) {
// Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
llvm::APSInt MaxThreads(32);
MaxThreads = Attr->getMaxThreads()->EvaluateKnownConstInt(getContext());
if (MaxThreads > 0) {
if (MaxThreadsVal)
*MaxThreadsVal = MaxThreads.getExtValue();
if (F) {
// Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
MaxThreads.getExtValue());
}
if (F)
F->addFnAttr("nvvm.maxntid", llvm::utostr(MaxThreads.getExtValue()));
}
// min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it

View File

@ -10,23 +10,30 @@
#endif
// CHECK: @Kernel1() #[[ATTR0:[0-9]+]]
// CHECK: @Kernel2() #[[ATTR1:[0-9]+]]
// CHECK: @{{.*}}Kernel3{{.*}}() #[[ATTR1]]
// CHECK: @{{.*}}Kernel4{{.*}}() #[[ATTR0]]
// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR1:[0-9]+]]
// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR0]]
// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR3:[0-9]+]]
// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR2:[0-9]+]]
// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR3:[0-9]+]]
// CHECK: @{{.*}}Kernel7{{.*}}() #[[ATTR1]]
// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR4:[0-9]+]]
// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.minctasm"="258" {{.*}}}
// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="12" {{.*}}}
// CHECK-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
// CHECK-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
// CHECK-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
// CHECK-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
// CHECK-DAG: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}
// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR4:[0-9]+]]
// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR4]]
// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR5:[0-9]+]]
// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR6:[0-9]+]]
// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR0:[0-9]+]]
// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR0]]
// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR1:[0-9]+]]
// CHECK_MAX_BLOCKS: @{{.*}}Kernel7_sm_90{{.*}} #[[ATTR2:[0-9]+]]
// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR3:[0-9]+]]
// CHECK_MAX_BLOCKS: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.minctasm"="2" {{.*}}}
// CHECK_MAX_BLOCKS: attributes #[[ATTR5]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.minctasm"="258" {{.*}}}
// CHECK_MAX_BLOCKS: attributes #[[ATTR6]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.minctasm"="12" {{.*}}}
// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}
// Test both max threads per block and Min cta per sm.
extern "C" {
@ -37,8 +44,6 @@ Kernel1()
}
}
// CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"maxntidx", i32 256}
#ifdef USE_MAX_BLOCKS
// Test max threads per block and min/max cta per sm.
extern "C" {
@ -48,8 +53,6 @@ Kernel1_sm_90()
{
}
}
// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxntidx", i32 256}
#endif // USE_MAX_BLOCKS
// Test only max threads per block. Min cta per sm defaults to 0, and
@ -62,8 +65,6 @@ Kernel2()
}
}
// CHECK: !{{[0-9]+}} = !{ptr @Kernel2, !"maxntidx", i32 256}
template <int max_threads_per_block>
__global__ void
__launch_bounds__(max_threads_per_block)
@ -72,7 +73,6 @@ Kernel3()
}
template __global__ void Kernel3<MAX_THREADS_PER_BLOCK>();
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel3{{.*}}, !"maxntidx", i32 256}
template <int max_threads_per_block, int min_blocks_per_mp>
__global__ void
@ -82,7 +82,6 @@ Kernel4()
}
template __global__ void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256}
#ifdef USE_MAX_BLOCKS
template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
@ -93,7 +92,6 @@ Kernel4_sm_90()
}
template __global__ void Kernel4_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxntidx", i32 256}
#endif //USE_MAX_BLOCKS
const int constint = 100;
@ -106,8 +104,6 @@ Kernel5()
}
template __global__ void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356}
#ifdef USE_MAX_BLOCKS
template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
@ -120,7 +116,6 @@ Kernel5_sm_90()
}
template __global__ void Kernel5_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxntidx", i32 356}
#endif //USE_MAX_BLOCKS
// Make sure we don't emit negative launch bounds values.
@ -129,15 +124,12 @@ __launch_bounds__( -MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
Kernel6()
{
}
// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"maxntidx",
__global__ void
__launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP )
Kernel7()
{
}
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"maxntidx",
// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"minctasm",
#ifdef USE_MAX_BLOCKS
__global__ void
@ -145,17 +137,12 @@ __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP, -MAX_BLOCKS_PER_MP
Kernel7_sm_90()
{
}
// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxntidx",
// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"minctasm",
// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxclusterrank",
#endif // USE_MAX_BLOCKS
const char constchar = 12;
__global__ void __launch_bounds__(constint, constchar) Kernel8() {}
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100
#ifdef USE_MAX_BLOCKS
const char constchar_2 = 14;
__global__ void __launch_bounds__(constint, constchar, constchar_2) Kernel8_sm_90() {}
// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxntidx", i32 100
#endif // USE_MAX_BLOCKS

View File

@ -11,9 +11,13 @@
// Check that the target attributes are set on the generated kernel
void func() {
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}}) #0
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}})
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #4
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #0
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}})
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #4
// NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #[[ATTR0:[0-9]+]]
// NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}}) #[[ATTR1:[0-9]+]]
// NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #[[ATTR2:[0-9]+]]
#pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]])
{}
@ -34,9 +38,12 @@ void func() {
// AMD-SAME: "omp_target_thread_limit"="17"
// It is unclear if we should use the AMD annotations for other targets, we do for now.
// NVIDIA: "omp_target_thread_limit"="20"
// NVIDIA: "omp_target_thread_limit"="45"
// NVIDIA: "omp_target_thread_limit"="17"
// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l18, !"maxntidx", i32 20}
// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l20, !"maxntidx", i32 45}
// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l22, !"maxntidx", i32 17}
// NVIDIA: attributes #[[ATTR0]]
// NVIDIA-SAME: "nvvm.maxntid"="20"
// NVIDIA-SAME: "omp_target_thread_limit"="20"
// NVIDIA: attributes #[[ATTR1]]
// NVIDIA-SAME: "nvvm.maxntid"="45"
// NVIDIA-SAME: "omp_target_thread_limit"="45"
// NVIDIA: attributes #[[ATTR2]]
// NVIDIA-SAME: "nvvm.maxntid"="17"
// NVIDIA-SAME: "omp_target_thread_limit"="17"

View File

@ -7,23 +7,21 @@
#define HEADER
void foo(int N) {
// CHECK: l11, !"maxntidx", i32 128}
// CHECK: define {{.*}}l11{{.*}} #[[ATTR0:[0-9]+]]
#pragma omp target teams distribute parallel for simd
for (int i = 0; i < N; ++i)
;
// CHECK: l15, !"maxntidx", i32 4}
// CHECK: define {{.*}}l15{{.*}} #[[ATTR1:[0-9]+]]
#pragma omp target teams distribute parallel for simd thread_limit(4)
for (int i = 0; i < N; ++i)
;
// CHECK-NOT: l21, !"maxntidx", i32 128}
// CHECK: l21, !"maxntidx", i32 42}
// CHECK-NOT: l21, !"maxntidx", i32 128}
// CHECK: define {{.*}}l20{{.*}} #[[ATTR2:[0-9]+]]
#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42))))
for (int i = 0; i < N; ++i)
;
// CHECK-NOT: l27, !"maxntidx", i32 42}
// CHECK: l27, !"maxntidx", i32 22}
// CHECK-NOT: l27, !"maxntidx", i32 42}
// CHECK: define {{.*}}l25{{.*}} #[[ATTR3:[0-9]+]]
#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22)
for (int i = 0; i < N; ++i)
;
@ -31,3 +29,7 @@ void foo(int N) {
#endif
// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="128" {{.*}}}
// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="4" {{.*}}}
// CHECK: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="42" {{.*}}}
// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxntid"="22" {{.*}}}

View File

@ -74,6 +74,23 @@ Function Attributes
This attribute indicates the maximum number of registers to be used for the
kernel function.
``"nvvm.maxntid"="<x>[,<y>[,<z>]]"``
This attribute declares the maximum number of threads in the thread block
(CTA). The maximum number of threads is the product of the maximum extent in
each dimension. Exceeding the maximum number of threads results in a runtime
error or kernel launch failure.
``"nvvm.reqntid"="<x>[,<y>[,<z>]]"``
This attribute declares the exact number of threads in the thread block
(CTA). The number of threads is the product of the value in each dimension.
Specifying a different CTA dimension at launch will result in a runtime
error or kernel launch failure.
``"nvvm.cluster_dim"="<x>[,<y>[,<z>]]"``
This attribute declares the number of thread blocks (CTAs) in the cluster.
The total number of CTAs is the product of the number of CTAs in each
dimension. Specifying a different cluster dimension at launch will result in
a runtime error or kernel launch failure. Only supported for Hopper+.
.. _address_spaces:

View File

@ -6406,45 +6406,13 @@ void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
KernelEnvironmentGV->setInitializer(NewInitializer);
}
static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
Module &M = *Kernel.getParent();
NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
for (auto *Op : MD->operands()) {
if (Op->getNumOperands() != 3)
continue;
auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
if (!KernelOp || KernelOp->getValue() != &Kernel)
continue;
auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
if (!Prop || Prop->getString() != Name)
continue;
return Op;
}
return nullptr;
}
static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
bool Min) {
// Update the "maxntidx" metadata for NVIDIA, or add it.
MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
if (ExistingOp) {
auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
ExistingOp->replaceOperandWith(
2, ConstantAsMetadata::get(ConstantInt::get(
OldVal->getValue()->getType(),
Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
} else {
LLVMContext &Ctx = Kernel.getContext();
Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
MDString::get(Ctx, Name),
ConstantAsMetadata::get(
ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
// Append metadata to nvvm.annotations
Module &M = *Kernel.getParent();
NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
MD->addOperand(MDNode::get(Ctx, MDVals));
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
bool Min) {
if (Kernel.hasFnAttribute(Name)) {
int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
}
Kernel.addFnAttr(Name, llvm::utostr(Value));
}
std::pair<int32_t, int32_t>
@ -6466,9 +6434,8 @@ OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
return {LB, UB};
}
if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
if (Kernel.hasFnAttribute("nvvm.maxntid")) {
int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
}
return {0, ThreadLimit};
@ -6485,7 +6452,7 @@ void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
return;
}
updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
}
std::pair<int32_t, int32_t>

View File

@ -13,11 +13,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
@ -46,6 +48,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Regex.h"
#include "llvm/TargetParser/Triple.h"
#include <cstdint>
#include <cstring>
#include <numeric>
@ -5021,6 +5024,43 @@ bool llvm::UpgradeDebugInfo(Module &M) {
return Modified;
}
static void upgradeNVVMFnVectorAttr(const StringRef Attr, const char DimC,
GlobalValue *GV, const Metadata *V) {
Function *F = cast<Function>(GV);
constexpr StringLiteral DefaultValue = "1";
StringRef Vect3[3] = {DefaultValue, DefaultValue, DefaultValue};
unsigned Length = 0;
if (F->hasFnAttribute(Attr)) {
// We expect the existing attribute to have the form "x[,y[,z]]". Here we
// parse these elements placing them into Vect3
StringRef S = F->getFnAttribute(Attr).getValueAsString();
for (; Length < 3 && !S.empty(); Length++) {
auto [Part, Rest] = S.split(',');
Vect3[Length] = Part.trim();
S = Rest;
}
}
const unsigned Dim = DimC - 'x';
assert(Dim >= 0 && Dim < 3 && "Unexpected dim char");
const uint64_t VInt = mdconst::extract<ConstantInt>(V)->getZExtValue();
// local variable required for StringRef in Vect3 to point to.
const std::string VStr = llvm::utostr(VInt);
Vect3[Dim] = VStr;
Length = std::max(Length, Dim + 1);
const std::string NewAttr = llvm::join(ArrayRef(Vect3, Length), ",");
F->addFnAttr(Attr, NewAttr);
}
static inline bool isXYZ(StringRef S) {
return S == "x" || S == "y" || S == "z";
}
bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
const Metadata *V) {
if (K == "kernel") {
@ -5059,6 +5099,18 @@ bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
cast<Function>(GV)->addFnAttr("nvvm.maxnreg", llvm::utostr(CV));
return true;
}
if (K.consume_front("maxntid") && isXYZ(K)) {
upgradeNVVMFnVectorAttr("nvvm.maxntid", K[0], GV, V);
return true;
}
if (K.consume_front("reqntid") && isXYZ(K)) {
upgradeNVVMFnVectorAttr("nvvm.reqntid", K[0], GV, V);
return true;
}
if (K.consume_front("cluster_dim_") && isXYZ(K)) {
upgradeNVVMFnVectorAttr("nvvm.cluster_dim", K[0], GV, V);
return true;
}
return false;
}

View File

@ -35,6 +35,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@ -427,24 +428,15 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
// If the NVVM IR has some of reqntid* specified, then output
// the reqntid directive, and set the unspecified ones to 1.
// If none of Reqntid* is specified, don't output reqntid directive.
std::optional<unsigned> Reqntidx = getReqNTIDx(F);
std::optional<unsigned> Reqntidy = getReqNTIDy(F);
std::optional<unsigned> Reqntidz = getReqNTIDz(F);
const auto ReqNTID = getReqNTID(F);
if (!ReqNTID.empty())
O << formatv(".reqntid {0:$[, ]}\n",
make_range(ReqNTID.begin(), ReqNTID.end()));
if (Reqntidx || Reqntidy || Reqntidz)
O << ".reqntid " << Reqntidx.value_or(1) << ", " << Reqntidy.value_or(1)
<< ", " << Reqntidz.value_or(1) << "\n";
// If the NVVM IR has some of maxntid* specified, then output
// the maxntid directive, and set the unspecified ones to 1.
// If none of maxntid* is specified, don't output maxntid directive.
std::optional<unsigned> Maxntidx = getMaxNTIDx(F);
std::optional<unsigned> Maxntidy = getMaxNTIDy(F);
std::optional<unsigned> Maxntidz = getMaxNTIDz(F);
if (Maxntidx || Maxntidy || Maxntidz)
O << ".maxntid " << Maxntidx.value_or(1) << ", " << Maxntidy.value_or(1)
<< ", " << Maxntidz.value_or(1) << "\n";
const auto MaxNTID = getMaxNTID(F);
if (!MaxNTID.empty())
O << formatv(".maxntid {0:$[, ]}\n",
make_range(MaxNTID.begin(), MaxNTID.end()));
if (const auto Mincta = getMinCTASm(F))
O << ".minnctapersm " << *Mincta << "\n";
@ -458,21 +450,19 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
if (STI->getSmVersion() >= 90) {
std::optional<unsigned> ClusterX = getClusterDimx(F);
std::optional<unsigned> ClusterY = getClusterDimy(F);
std::optional<unsigned> ClusterZ = getClusterDimz(F);
const auto ClusterDim = getClusterDim(F);
if (ClusterX || ClusterY || ClusterZ) {
if (!ClusterDim.empty()) {
O << ".explicitcluster\n";
if (ClusterX.value_or(1) != 0) {
assert(ClusterY.value_or(1) && ClusterZ.value_or(1) &&
if (ClusterDim[0] != 0) {
assert(llvm::all_of(ClusterDim, [](unsigned D) { return D != 0; }) &&
"cluster_dim_x != 0 implies cluster_dim_y and cluster_dim_z "
"should be non-zero as well");
O << ".reqnctapercluster " << ClusterX.value_or(1) << ", "
<< ClusterY.value_or(1) << ", " << ClusterZ.value_or(1) << "\n";
O << formatv(".reqnctapercluster {0:$[, ]}\n",
make_range(ClusterDim.begin(), ClusterDim.end()));
} else {
assert(!ClusterY.value_or(1) && !ClusterZ.value_or(1) &&
assert(llvm::all_of(ClusterDim, [](unsigned D) { return D == 0; }) &&
"cluster_dim_x == 0 implies cluster_dim_y and cluster_dim_z "
"should be 0 as well");
}

View File

@ -50,33 +50,10 @@ static std::string getHash(StringRef Str) {
return llvm::utohexstr(Hash.low(), /*LowerCase=*/true);
}
static void addKernelMetadata(Module &M, Function *F) {
llvm::LLVMContext &Ctx = M.getContext();
// Get "nvvm.annotations" metadata node.
llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
// This kernel is only to be called single-threaded.
llvm::Metadata *ThreadXMDVals[] = {
llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidx"),
llvm::ConstantAsMetadata::get(
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
llvm::Metadata *ThreadYMDVals[] = {
llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidy"),
llvm::ConstantAsMetadata::get(
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
llvm::Metadata *ThreadZMDVals[] = {
llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidz"),
llvm::ConstantAsMetadata::get(
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
static void addKernelAttrs(Function *F) {
F->addFnAttr("nvvm.maxclusterrank", "1");
F->addFnAttr("nvvm.maxntid", "1");
F->setCallingConv(CallingConv::PTX_Kernel);
// Append metadata to nvvm.annotations.
MD->addOperand(llvm::MDNode::get(Ctx, ThreadXMDVals));
MD->addOperand(llvm::MDNode::get(Ctx, ThreadYMDVals));
MD->addOperand(llvm::MDNode::get(Ctx, ThreadZMDVals));
}
static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
@ -88,7 +65,7 @@ static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
Function *InitOrFiniKernel = Function::createWithDefaultAttr(
FunctionType::get(Type::getVoidTy(M.getContext()), false),
GlobalValue::WeakODRLinkage, 0, InitOrFiniKernelName, &M);
addKernelMetadata(M, InitOrFiniKernel);
addKernelAttrs(InitOrFiniKernel);
return InitOrFiniKernel;
}

View File

@ -575,13 +575,14 @@ unsigned NVPTXTTIImpl::getAssumedAddrSpace(const Value *V) const {
void NVPTXTTIImpl::collectKernelLaunchBounds(
const Function &F,
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
std::optional<unsigned> Val;
if ((Val = getMaxClusterRank(F)))
if (const auto Val = getMaxClusterRank(F))
LB.push_back({"maxclusterrank", *Val});
if ((Val = getMaxNTIDx(F)))
LB.push_back({"maxntidx", *Val});
if ((Val = getMaxNTIDy(F)))
LB.push_back({"maxntidy", *Val});
if ((Val = getMaxNTIDz(F)))
LB.push_back({"maxntidz", *Val});
const auto MaxNTID = getMaxNTID(F);
if (MaxNTID.size() > 0)
LB.push_back({"maxntidx", MaxNTID[0]});
if (MaxNTID.size() > 1)
LB.push_back({"maxntidy", MaxNTID[1]});
if (MaxNTID.size() > 2)
LB.push_back({"maxntidz", MaxNTID[2]});
}

View File

@ -13,6 +13,8 @@
#include "NVPTXUtilities.h"
#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@ -20,6 +22,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Mutex.h"
#include <cstdint>
#include <cstring>
#include <map>
#include <mutex>
@ -196,6 +199,35 @@ static std::optional<unsigned> getFnAttrParsedInt(const Function &F,
: std::nullopt;
}
static SmallVector<unsigned, 3> getFnAttrParsedVector(const Function &F,
StringRef Attr) {
SmallVector<unsigned, 3> V;
auto &Ctx = F.getContext();
if (F.hasFnAttribute(Attr)) {
// We expect the attribute value to be of the form "x[,y[,z]]", where x, y,
// and z are unsigned values.
StringRef S = F.getFnAttribute(Attr).getValueAsString();
for (unsigned I = 0; I < 3 && !S.empty(); I++) {
auto [First, Rest] = S.split(",");
unsigned IntVal;
if (First.trim().getAsInteger(0, IntVal))
Ctx.emitError("can't parse integer attribute " + First + " in " + Attr);
V.push_back(IntVal);
S = Rest;
}
}
return V;
}
static std::optional<uint64_t> getVectorProduct(ArrayRef<unsigned> V) {
if (V.empty())
return std::nullopt;
return std::accumulate(V.begin(), V.end(), 1, std::multiplies<uint64_t>{});
}
bool isParamGridConstant(const Value &V) {
if (const Argument *Arg = dyn_cast<Argument>(&V)) {
// "grid_constant" counts argument indices starting from 1
@ -254,71 +286,39 @@ StringRef getSamplerName(const Value &V) {
return V.getName();
}
std::optional<unsigned> getMaxNTIDx(const Function &F) {
return findOneNVVMAnnotation(&F, "maxntidx");
SmallVector<unsigned, 3> getMaxNTID(const Function &F) {
return getFnAttrParsedVector(F, "nvvm.maxntid");
}
std::optional<unsigned> getMaxNTIDy(const Function &F) {
return findOneNVVMAnnotation(&F, "maxntidy");
SmallVector<unsigned, 3> getReqNTID(const Function &F) {
return getFnAttrParsedVector(F, "nvvm.reqntid");
}
std::optional<unsigned> getMaxNTIDz(const Function &F) {
return findOneNVVMAnnotation(&F, "maxntidz");
SmallVector<unsigned, 3> getClusterDim(const Function &F) {
return getFnAttrParsedVector(F, "nvvm.cluster_dim");
}
std::optional<unsigned> getMaxNTID(const Function &F) {
std::optional<uint64_t> getOverallMaxNTID(const Function &F) {
// Note: The semantics here are a bit strange. The PTX ISA states the
// following (11.4.2. Performance-Tuning Directives: .maxntid):
//
// Note that this directive guarantees that the total number of threads does
// not exceed the maximum, but does not guarantee that the limit in any
// particular dimension is not exceeded.
std::optional<unsigned> MaxNTIDx = getMaxNTIDx(F);
std::optional<unsigned> MaxNTIDy = getMaxNTIDy(F);
std::optional<unsigned> MaxNTIDz = getMaxNTIDz(F);
if (MaxNTIDx || MaxNTIDy || MaxNTIDz)
return MaxNTIDx.value_or(1) * MaxNTIDy.value_or(1) * MaxNTIDz.value_or(1);
return std::nullopt;
const auto MaxNTID = getMaxNTID(F);
return getVectorProduct(MaxNTID);
}
std::optional<unsigned> getClusterDimx(const Function &F) {
return findOneNVVMAnnotation(&F, "cluster_dim_x");
}
std::optional<unsigned> getClusterDimy(const Function &F) {
return findOneNVVMAnnotation(&F, "cluster_dim_y");
}
std::optional<unsigned> getClusterDimz(const Function &F) {
return findOneNVVMAnnotation(&F, "cluster_dim_z");
std::optional<uint64_t> getOverallReqNTID(const Function &F) {
// Note: The semantics here are a bit strange. See getMaxNTID.
const auto ReqNTID = getReqNTID(F);
return getVectorProduct(ReqNTID);
}
std::optional<unsigned> getMaxClusterRank(const Function &F) {
return getFnAttrParsedInt(F, "nvvm.maxclusterrank");
}
std::optional<unsigned> getReqNTIDx(const Function &F) {
return findOneNVVMAnnotation(&F, "reqntidx");
}
std::optional<unsigned> getReqNTIDy(const Function &F) {
return findOneNVVMAnnotation(&F, "reqntidy");
}
std::optional<unsigned> getReqNTIDz(const Function &F) {
return findOneNVVMAnnotation(&F, "reqntidz");
}
std::optional<unsigned> getReqNTID(const Function &F) {
// Note: The semantics here are a bit strange. See getMaxNTID.
std::optional<unsigned> ReqNTIDx = getReqNTIDx(F);
std::optional<unsigned> ReqNTIDy = getReqNTIDy(F);
std::optional<unsigned> ReqNTIDz = getReqNTIDz(F);
if (ReqNTIDx || ReqNTIDy || ReqNTIDz)
return ReqNTIDx.value_or(1) * ReqNTIDy.value_or(1) * ReqNTIDz.value_or(1);
return std::nullopt;
}
std::optional<unsigned> getMinCTASm(const Function &F) {
return getFnAttrParsedInt(F, "nvvm.minctasm");
}

View File

@ -14,6 +14,7 @@
#define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
#include "NVPTX.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
@ -47,19 +48,12 @@ StringRef getTextureName(const Value &);
StringRef getSurfaceName(const Value &);
StringRef getSamplerName(const Value &);
std::optional<unsigned> getMaxNTIDx(const Function &);
std::optional<unsigned> getMaxNTIDy(const Function &);
std::optional<unsigned> getMaxNTIDz(const Function &);
std::optional<unsigned> getMaxNTID(const Function &);
SmallVector<unsigned, 3> getMaxNTID(const Function &);
SmallVector<unsigned, 3> getReqNTID(const Function &);
SmallVector<unsigned, 3> getClusterDim(const Function &);
std::optional<unsigned> getReqNTIDx(const Function &);
std::optional<unsigned> getReqNTIDy(const Function &);
std::optional<unsigned> getReqNTIDz(const Function &);
std::optional<unsigned> getReqNTID(const Function &);
std::optional<unsigned> getClusterDimx(const Function &);
std::optional<unsigned> getClusterDimy(const Function &);
std::optional<unsigned> getClusterDimz(const Function &);
std::optional<uint64_t> getOverallMaxNTID(const Function &);
std::optional<uint64_t> getOverallReqNTID(const Function &);
std::optional<unsigned> getMaxClusterRank(const Function &);
std::optional<unsigned> getMinCTASm(const Function &);

View File

@ -67,8 +67,8 @@ static bool runNVVMIntrRange(Function &F) {
unsigned x, y, z;
} MaxBlockSize, MaxGridSize;
const unsigned MetadataNTID = getReqNTID(F).value_or(
getMaxNTID(F).value_or(std::numeric_limits<unsigned>::max()));
const unsigned MetadataNTID = getOverallReqNTID(F).value_or(
getOverallMaxNTID(F).value_or(std::numeric_limits<unsigned>::max()));
MaxBlockSize.x = std::min(1024u, MetadataNTID);
MaxBlockSize.y = std::min(1024u, MetadataNTID);

View File

@ -24,11 +24,11 @@ attributes #0 = {
"omp_target_num_teams"="100"
"omp_target_thread_limit"="101"
"nvvm.maxclusterrank"="200"
"nvvm.maxntid"="210,211,212"
}
!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
!nvvm.annotations = !{!7, !8, !9, !10}
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
@ -36,7 +36,3 @@ attributes #0 = {
!3 = !{}
!4 = !DISubroutineType(types: !3)
!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
!7 = !{ptr @test, !"maxntidx", i32 210}
!8 = !{ptr @test, !"maxntidy", i32 211}
!9 = !{ptr @test, !"maxntidz", i32 212}
!10 = distinct !{ptr null, !"kernel", i32 1}

View File

@ -9,14 +9,14 @@
; CHECK: .global .surfref surface
; CHECK: .entry kernel_func_maxntid
define void @kernel_func_maxntid(ptr %a) {
define ptx_kernel void @kernel_func_maxntid(ptr %a) "nvvm.maxntid"="10,20,30" {
; CHECK: .maxntid 10, 20, 30
; CHECK: ret
ret void
}
; CHECK: .entry kernel_func_reqntid
define void @kernel_func_reqntid(ptr %a) {
define ptx_kernel void @kernel_func_reqntid(ptr %a) "nvvm.reqntid"="11,22,33" {
; CHECK: .reqntid 11, 22, 33
; CHECK: ret
ret void
@ -36,13 +36,7 @@ define ptx_kernel void @kernel_func_maxnreg() "nvvm.maxnreg"="1234" {
ret void
}
!nvvm.annotations = !{!1, !2, !3, !4, !9, !10}
!1 = !{ptr @kernel_func_maxntid, !"kernel", i32 1}
!2 = !{ptr @kernel_func_maxntid, !"maxntidx", i32 10, !"maxntidy", i32 20, !"maxntidz", i32 30}
!3 = !{ptr @kernel_func_reqntid, !"kernel", i32 1}
!4 = !{ptr @kernel_func_reqntid, !"reqntidx", i32 11, !"reqntidy", i32 22, !"reqntidz", i32 33}
!nvvm.annotations = !{!9, !10}
!9 = !{ptr addrspace(1) @texture, !"texture", i32 1}
!10 = !{ptr addrspace(1) @surface, !"surface", i32 1}

View File

@ -26,8 +26,4 @@ bb:
ret void
}
attributes #0 = { norecurse nounwind "polly.skip.fn" }
!nvvm.annotations = !{!0}
!0 = !{ptr @spam, !"maxntidx", i64 1, !"maxntidy", i64 1, !"maxntidz", i64 1}
attributes #0 = { norecurse nounwind "polly.skip.fn" "nvvm.maxntid"="1,1,1" }

View File

@ -3,7 +3,7 @@
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 | FileCheck -check-prefixes=CHECK90 %s
; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %}
define ptx_kernel void @kernel_func_clusterxyz() {
define ptx_kernel void @kernel_func_clusterxyz() "nvvm.cluster_dim"="3,5,7" {
; CHECK80-LABEL: kernel_func_clusterxyz(
; CHECK80: {
; CHECK80-EMPTY:
@ -21,8 +21,3 @@ define ptx_kernel void @kernel_func_clusterxyz() {
; CHECK90-NEXT: ret;
ret void
}
!nvvm.annotations = !{!1}
!1 = !{ptr @kernel_func_clusterxyz, !"cluster_dim_x", i32 3, !"cluster_dim_y", i32 5, !"cluster_dim_z", i32 7}

View File

@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5
; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -mcpu=sm_20 -passes=nvvm-intr-range | FileCheck %s
define ptx_kernel i32 @test_maxntid() {
define ptx_kernel i32 @test_maxntid() "nvvm.maxntid"="32,1,3" {
; CHECK-LABEL: define ptx_kernel i32 @test_maxntid(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
@ -31,9 +31,9 @@ define ptx_kernel i32 @test_maxntid() {
ret i32 %11
}
define ptx_kernel i32 @test_reqntid() {
define ptx_kernel i32 @test_reqntid() "nvvm.reqntid"="20" {
; CHECK-LABEL: define ptx_kernel i32 @test_reqntid(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
@ -64,9 +64,9 @@ define ptx_kernel i32 @test_reqntid() {
;; A case like this could occur if a function with the sreg intrinsic was
;; inlined into a kernel where the tid metadata is present, ensure the range is
;; updated.
define ptx_kernel i32 @test_inlined() {
define ptx_kernel i32 @test_inlined() "nvvm.maxntid"="4" {
; CHECK-LABEL: define ptx_kernel i32 @test_inlined(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 4) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; CHECK-NEXT: ret i32 [[TMP1]]
;
@ -81,8 +81,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
!nvvm.annotations = !{!0, !1, !2}
!0 = !{ptr @test_maxntid, !"maxntidx", i32 32, !"maxntidz", i32 3}
!1 = !{ptr @test_reqntid, !"reqntidx", i32 20}
!2 = !{ptr @test_inlined, !"maxntidx", i32 4}

View File

@ -84,11 +84,4 @@ define internal void @bar() {
; CHECK: while.end:
; CHECK-NEXT: ret void
; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="1" }
; CHECK: [[META1:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidx", i32 1}
; CHECK: [[META2:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidy", i32 1}
; CHECK: [[META3:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidz", i32 1}
; CHECK: [[META4:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidx", i32 1}
; CHECK: [[META5:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidy", i32 1}
; CHECK: [[META6:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidz", i32 1}
; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="1" "nvvm.maxntid"="1" }

View File

@ -4,20 +4,16 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-unknown"
; CHECK: .maxntid 128, 1, 1
; CHECK: .maxntid 128
; CHECK: .minnctapersm 2
; CHECK_SM_90: .maxclusterrank 8
; CHECK_SM_80-NOT: .maxclusterrank 8
; Make sure that for SM version prior to 90 `.maxclusterrank` directive is
; silently ignored.
define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() "nvvm.minctasm"="2" "nvvm.maxclusterrank"="8" {
define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() "nvvm.minctasm"="2" "nvvm.maxclusterrank"="8" "nvvm.maxntid"="128" {
entry:
%a = alloca i32, align 4
store volatile i32 1, ptr %a, align 4
ret void
}
!nvvm.annotations = !{!1}
!1 = !{ptr @_Z18TestMaxClusterRankv, !"maxntidx", i32 128}

View File

@ -48,7 +48,55 @@ define void @test_maxnreg() {
ret void
}
!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6}
define void @test_maxntid_1() {
; CHECK-LABEL: define void @test_maxntid_1(
; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
}
define void @test_maxntid_2() {
; CHECK-LABEL: define void @test_maxntid_2(
; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
}
define void @test_maxntid_3() {
; CHECK-LABEL: define void @test_maxntid_3(
; CHECK-SAME: ) #[[ATTR6:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
}
define void @test_maxntid_4() {
; CHECK-LABEL: define void @test_maxntid_4(
; CHECK-SAME: ) #[[ATTR7:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
}
define void @test_reqntid() {
; CHECK-LABEL: define void @test_reqntid(
; CHECK-SAME: ) #[[ATTR8:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
}
define void @test_cluster_dim() {
; CHECK-LABEL: define void @test_cluster_dim(
; CHECK-SAME: ) #[[ATTR9:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
}
!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12}
!0 = !{ptr @test_align, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020010}
!1 = !{null, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020008}
@ -57,12 +105,24 @@ define void @test_maxnreg() {
!4 = !{ptr @test_cluster_max_blocks, !"cluster_max_blocks", i32 3}
!5 = !{ptr @test_minctasm, !"minctasm", i32 4}
!6 = !{ptr @test_maxnreg, !"maxnreg", i32 5}
!7 = !{ptr @test_maxntid_1, !"maxntidx", i32 50}
!8 = !{ptr @test_maxntid_2, !"maxntidx", i32 11, !"maxntidy", i32 22, !"maxntidz", i32 33}
!9 = !{ptr @test_maxntid_3, !"maxntidz", i32 11, !"maxntidy", i32 22, !"maxntidx", i32 33}
!10 = !{ptr @test_maxntid_4, !"maxntidz", i32 100}
!11 = !{ptr @test_reqntid, !"reqntidx", i32 31, !"reqntidy", i32 32, !"reqntidz", i32 33}
!12 = !{ptr @test_cluster_dim, !"cluster_dim_x", i32 101, !"cluster_dim_y", i32 102, !"cluster_dim_z", i32 103}
;.
; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="2" }
; CHECK: attributes #[[ATTR1]] = { "nvvm.maxclusterrank"="3" }
; CHECK: attributes #[[ATTR2]] = { "nvvm.minctasm"="4" }
; CHECK: attributes #[[ATTR3]] = { "nvvm.maxnreg"="5" }
; CHECK: attributes #[[ATTR4]] = { "nvvm.maxntid"="50" }
; CHECK: attributes #[[ATTR5]] = { "nvvm.maxntid"="11,22,33" }
; CHECK: attributes #[[ATTR6]] = { "nvvm.maxntid"="33,22,11" }
; CHECK: attributes #[[ATTR7]] = { "nvvm.maxntid"="1,1,100" }
; CHECK: attributes #[[ATTR8]] = { "nvvm.reqntid"="31,32,33" }
; CHECK: attributes #[[ATTR9]] = { "nvvm.cluster_dim"="101,102,103" }
;.
; CHECK: [[META0:![0-9]+]] = !{ptr @test_align, !"align", i32 8}
;.

View File

@ -18,8 +18,10 @@
#include "mlir/Target/LLVMIR/ModuleTranslation.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/FormatVariadic.h"
using namespace mlir;
using namespace mlir::LLVM;
@ -195,48 +197,33 @@ public:
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)
return failure();
llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
llvm::Function *llvmFunc = moduleTranslation.lookupFunction(func.getName());
auto generateMetadata = [&](int dim, StringRef name) {
llvm::Metadata *llvmMetadata[] = {
llvm::ValueAsMetadata::get(llvmFunc),
llvm::MDString::get(llvmContext, name),
llvm::ValueAsMetadata::get(llvm::ConstantInt::get(
llvm::Type::getInt32Ty(llvmContext), dim))};
llvm::MDNode *llvmMetadataNode =
llvm::MDNode::get(llvmContext, llvmMetadata);
moduleTranslation.getOrInsertNamedModuleMetadata("nvvm.annotations")
->addOperand(llvmMetadataNode);
};
if (attribute.getName() == NVVM::NVVMDialect::getMaxntidAttrName()) {
if (!dyn_cast<DenseI32ArrayAttr>(attribute.getValue()))
return failure();
auto values = cast<DenseI32ArrayAttr>(attribute.getValue());
generateMetadata(values[0], NVVM::NVVMDialect::getMaxntidXName());
if (values.size() > 1)
generateMetadata(values[1], NVVM::NVVMDialect::getMaxntidYName());
if (values.size() > 2)
generateMetadata(values[2], NVVM::NVVMDialect::getMaxntidZName());
const std::string attr = llvm::formatv(
"{0:$[,]}", llvm::make_range(values.asArrayRef().begin(),
values.asArrayRef().end()));
llvmFunc->addFnAttr("nvvm.maxntid", attr);
} else if (attribute.getName() == NVVM::NVVMDialect::getReqntidAttrName()) {
if (!dyn_cast<DenseI32ArrayAttr>(attribute.getValue()))
return failure();
auto values = cast<DenseI32ArrayAttr>(attribute.getValue());
generateMetadata(values[0], NVVM::NVVMDialect::getReqntidXName());
if (values.size() > 1)
generateMetadata(values[1], NVVM::NVVMDialect::getReqntidYName());
if (values.size() > 2)
generateMetadata(values[2], NVVM::NVVMDialect::getReqntidZName());
const std::string attr = llvm::formatv(
"{0:$[,]}", llvm::make_range(values.asArrayRef().begin(),
values.asArrayRef().end()));
llvmFunc->addFnAttr("nvvm.reqntid", attr);
} else if (attribute.getName() ==
NVVM::NVVMDialect::getClusterDimAttrName()) {
if (!dyn_cast<DenseI32ArrayAttr>(attribute.getValue()))
return failure();
auto values = cast<DenseI32ArrayAttr>(attribute.getValue());
generateMetadata(values[0], NVVM::NVVMDialect::getClusterDimXName());
if (values.size() > 1)
generateMetadata(values[1], NVVM::NVVMDialect::getClusterDimYName());
if (values.size() > 2)
generateMetadata(values[2], NVVM::NVVMDialect::getClusterDimZName());
const std::string attr = llvm::formatv(
"{0:$[,]}", llvm::make_range(values.asArrayRef().begin(),
values.asArrayRef().end()));
llvmFunc->addFnAttr("nvvm.cluster_dim", attr);
} else if (attribute.getName() ==
NVVM::NVVMDialect::getClusterMaxBlocksAttrName()) {
auto value = dyn_cast<IntegerAttr>(attribute.getValue());

View File

@ -590,33 +590,24 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 2
llvm.return
}
// CHECK: define ptx_kernel void @kernel_func
// CHECK: !nvvm.annotations =
// CHECK: {ptr @kernel_func, !"maxntidx", i32 1}
// CHECK: {ptr @kernel_func, !"maxntidy", i32 23}
// CHECK: {ptr @kernel_func, !"maxntidz", i32 32}
// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]]
// CHECK: attributes #[[ATTR0]] = { "nvvm.maxntid"="1,23,32" }
// -----
llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.reqntid = array<i32: 1, 23, 32>} {
llvm.return
}
// CHECK: define ptx_kernel void @kernel_func
// CHECK: !nvvm.annotations =
// CHECK: {ptr @kernel_func, !"reqntidx", i32 1}
// CHECK: {ptr @kernel_func, !"reqntidy", i32 23}
// CHECK: {ptr @kernel_func, !"reqntidz", i32 32}
// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]]
// CHECK: attributes #[[ATTR0]] = { "nvvm.reqntid"="1,23,32" }
// -----
llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_dim = array<i32: 3, 5, 7>} {
llvm.return
}
// CHECK: define ptx_kernel void @kernel_func
// CHECK: !nvvm.annotations =
// CHECK: {ptr @kernel_func, !"cluster_dim_x", i32 3}
// CHECK: {ptr @kernel_func, !"cluster_dim_y", i32 5}
// CHECK: {ptr @kernel_func, !"cluster_dim_z", i32 7}
// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]]
// CHECK: attributes #[[ATTR0]] = { "nvvm.cluster_dim"="3,5,7" }
// -----
llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_max_blocks = 8} {
@ -650,11 +641,7 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 2
}
// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]]
// CHECK: attributes #[[ATTR0]] = { "nvvm.maxnreg"="32" "nvvm.minctasm"="16" }
// CHECK: !nvvm.annotations =
// CHECK: {ptr @kernel_func, !"maxntidx", i32 1}
// CHECK: {ptr @kernel_func, !"maxntidy", i32 23}
// CHECK: {ptr @kernel_func, !"maxntidz", i32 32}
// CHECK: attributes #[[ATTR0]] = { "nvvm.maxnreg"="32" "nvvm.maxntid"="1,23,32" "nvvm.minctasm"="16" }
// -----
// CHECK: define ptx_kernel void @kernel_func