[NVPTX] Convert vector function nvvm.annotations to attributes (#127736)
Replace some more nvvm.annotations with function attributes, auto-upgrading the annotations as needed. These new attributes will be more idiomatic and compile-time efficient than the annotations. - !"maxntid[xyz]" -> "nvvm.maxntid" - !"reqntid[xyz]" -> "nvvm.reqntid" - !"cluster_dim_[xyz]" -> "nvvm.cluster_dim"
This commit is contained in:
parent
14da7d5c1f
commit
6c2e170d04
@ -357,17 +357,13 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
|
||||
int32_t *MaxThreadsVal,
|
||||
int32_t *MinBlocksVal,
|
||||
int32_t *MaxClusterRankVal) {
|
||||
// Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
|
||||
llvm::APSInt MaxThreads(32);
|
||||
MaxThreads = Attr->getMaxThreads()->EvaluateKnownConstInt(getContext());
|
||||
if (MaxThreads > 0) {
|
||||
if (MaxThreadsVal)
|
||||
*MaxThreadsVal = MaxThreads.getExtValue();
|
||||
if (F) {
|
||||
// Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
|
||||
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
|
||||
MaxThreads.getExtValue());
|
||||
}
|
||||
if (F)
|
||||
F->addFnAttr("nvvm.maxntid", llvm::utostr(MaxThreads.getExtValue()));
|
||||
}
|
||||
|
||||
// min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it
|
||||
|
||||
@ -10,23 +10,30 @@
|
||||
#endif
|
||||
|
||||
// CHECK: @Kernel1() #[[ATTR0:[0-9]+]]
|
||||
// CHECK: @Kernel2() #[[ATTR1:[0-9]+]]
|
||||
// CHECK: @{{.*}}Kernel3{{.*}}() #[[ATTR1]]
|
||||
// CHECK: @{{.*}}Kernel4{{.*}}() #[[ATTR0]]
|
||||
// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR1:[0-9]+]]
|
||||
// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR0]]
|
||||
// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR3:[0-9]+]]
|
||||
// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR2:[0-9]+]]
|
||||
// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR3:[0-9]+]]
|
||||
// CHECK: @{{.*}}Kernel7{{.*}}() #[[ATTR1]]
|
||||
// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR4:[0-9]+]]
|
||||
|
||||
// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
|
||||
// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.minctasm"="258" {{.*}}}
|
||||
// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="12" {{.*}}}
|
||||
// CHECK-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
|
||||
// CHECK-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
|
||||
// CHECK-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
|
||||
// CHECK-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
|
||||
// CHECK-DAG: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}
|
||||
|
||||
// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR4:[0-9]+]]
|
||||
// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR4]]
|
||||
// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR5:[0-9]+]]
|
||||
// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR6:[0-9]+]]
|
||||
// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR0:[0-9]+]]
|
||||
// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR0]]
|
||||
// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR1:[0-9]+]]
|
||||
// CHECK_MAX_BLOCKS: @{{.*}}Kernel7_sm_90{{.*}} #[[ATTR2:[0-9]+]]
|
||||
// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR3:[0-9]+]]
|
||||
|
||||
// CHECK_MAX_BLOCKS: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.minctasm"="2" {{.*}}}
|
||||
// CHECK_MAX_BLOCKS: attributes #[[ATTR5]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.minctasm"="258" {{.*}}}
|
||||
// CHECK_MAX_BLOCKS: attributes #[[ATTR6]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.minctasm"="12" {{.*}}}
|
||||
// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
|
||||
// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
|
||||
// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
|
||||
// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}
|
||||
|
||||
// Test both max threads per block and Min cta per sm.
|
||||
extern "C" {
|
||||
@ -37,8 +44,6 @@ Kernel1()
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"maxntidx", i32 256}
|
||||
|
||||
#ifdef USE_MAX_BLOCKS
|
||||
// Test max threads per block and min/max cta per sm.
|
||||
extern "C" {
|
||||
@ -48,8 +53,6 @@ Kernel1_sm_90()
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxntidx", i32 256}
|
||||
#endif // USE_MAX_BLOCKS
|
||||
|
||||
// Test only max threads per block. Min cta per sm defaults to 0, and
|
||||
@ -62,8 +65,6 @@ Kernel2()
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK: !{{[0-9]+}} = !{ptr @Kernel2, !"maxntidx", i32 256}
|
||||
|
||||
template <int max_threads_per_block>
|
||||
__global__ void
|
||||
__launch_bounds__(max_threads_per_block)
|
||||
@ -72,7 +73,6 @@ Kernel3()
|
||||
}
|
||||
|
||||
template __global__ void Kernel3<MAX_THREADS_PER_BLOCK>();
|
||||
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel3{{.*}}, !"maxntidx", i32 256}
|
||||
|
||||
template <int max_threads_per_block, int min_blocks_per_mp>
|
||||
__global__ void
|
||||
@ -82,7 +82,6 @@ Kernel4()
|
||||
}
|
||||
template __global__ void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
|
||||
|
||||
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256}
|
||||
|
||||
#ifdef USE_MAX_BLOCKS
|
||||
template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
|
||||
@ -93,7 +92,6 @@ Kernel4_sm_90()
|
||||
}
|
||||
template __global__ void Kernel4_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
|
||||
|
||||
// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxntidx", i32 256}
|
||||
#endif //USE_MAX_BLOCKS
|
||||
|
||||
const int constint = 100;
|
||||
@ -106,8 +104,6 @@ Kernel5()
|
||||
}
|
||||
template __global__ void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
|
||||
|
||||
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356}
|
||||
|
||||
#ifdef USE_MAX_BLOCKS
|
||||
|
||||
template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
|
||||
@ -120,7 +116,6 @@ Kernel5_sm_90()
|
||||
}
|
||||
template __global__ void Kernel5_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
|
||||
|
||||
// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxntidx", i32 356}
|
||||
#endif //USE_MAX_BLOCKS
|
||||
|
||||
// Make sure we don't emit negative launch bounds values.
|
||||
@ -129,15 +124,12 @@ __launch_bounds__( -MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
|
||||
Kernel6()
|
||||
{
|
||||
}
|
||||
// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"maxntidx",
|
||||
|
||||
__global__ void
|
||||
__launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP )
|
||||
Kernel7()
|
||||
{
|
||||
}
|
||||
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"maxntidx",
|
||||
// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"minctasm",
|
||||
|
||||
#ifdef USE_MAX_BLOCKS
|
||||
__global__ void
|
||||
@ -145,17 +137,12 @@ __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP, -MAX_BLOCKS_PER_MP
|
||||
Kernel7_sm_90()
|
||||
{
|
||||
}
|
||||
// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxntidx",
|
||||
// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"minctasm",
|
||||
// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxclusterrank",
|
||||
#endif // USE_MAX_BLOCKS
|
||||
|
||||
const char constchar = 12;
|
||||
__global__ void __launch_bounds__(constint, constchar) Kernel8() {}
|
||||
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100
|
||||
|
||||
#ifdef USE_MAX_BLOCKS
|
||||
const char constchar_2 = 14;
|
||||
__global__ void __launch_bounds__(constint, constchar, constchar_2) Kernel8_sm_90() {}
|
||||
// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxntidx", i32 100
|
||||
#endif // USE_MAX_BLOCKS
|
||||
|
||||
@ -11,9 +11,13 @@
|
||||
|
||||
// Check that the target attributes are set on the generated kernel
|
||||
void func() {
|
||||
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}}) #0
|
||||
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}})
|
||||
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #4
|
||||
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #0
|
||||
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}})
|
||||
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #4
|
||||
|
||||
// NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #[[ATTR0:[0-9]+]]
|
||||
// NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}}) #[[ATTR1:[0-9]+]]
|
||||
// NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #[[ATTR2:[0-9]+]]
|
||||
|
||||
#pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]])
|
||||
{}
|
||||
@ -34,9 +38,12 @@ void func() {
|
||||
// AMD-SAME: "omp_target_thread_limit"="17"
|
||||
|
||||
// It is unclear if we should use the AMD annotations for other targets, we do for now.
|
||||
// NVIDIA: "omp_target_thread_limit"="20"
|
||||
// NVIDIA: "omp_target_thread_limit"="45"
|
||||
// NVIDIA: "omp_target_thread_limit"="17"
|
||||
// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l18, !"maxntidx", i32 20}
|
||||
// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l20, !"maxntidx", i32 45}
|
||||
// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l22, !"maxntidx", i32 17}
|
||||
// NVIDIA: attributes #[[ATTR0]]
|
||||
// NVIDIA-SAME: "nvvm.maxntid"="20"
|
||||
// NVIDIA-SAME: "omp_target_thread_limit"="20"
|
||||
// NVIDIA: attributes #[[ATTR1]]
|
||||
// NVIDIA-SAME: "nvvm.maxntid"="45"
|
||||
// NVIDIA-SAME: "omp_target_thread_limit"="45"
|
||||
// NVIDIA: attributes #[[ATTR2]]
|
||||
// NVIDIA-SAME: "nvvm.maxntid"="17"
|
||||
// NVIDIA-SAME: "omp_target_thread_limit"="17"
|
||||
|
||||
@ -7,23 +7,21 @@
|
||||
#define HEADER
|
||||
|
||||
void foo(int N) {
|
||||
// CHECK: l11, !"maxntidx", i32 128}
|
||||
// CHECK: define {{.*}}l11{{.*}} #[[ATTR0:[0-9]+]]
|
||||
#pragma omp target teams distribute parallel for simd
|
||||
for (int i = 0; i < N; ++i)
|
||||
;
|
||||
// CHECK: l15, !"maxntidx", i32 4}
|
||||
// CHECK: define {{.*}}l15{{.*}} #[[ATTR1:[0-9]+]]
|
||||
#pragma omp target teams distribute parallel for simd thread_limit(4)
|
||||
for (int i = 0; i < N; ++i)
|
||||
;
|
||||
// CHECK-NOT: l21, !"maxntidx", i32 128}
|
||||
// CHECK: l21, !"maxntidx", i32 42}
|
||||
// CHECK-NOT: l21, !"maxntidx", i32 128}
|
||||
|
||||
// CHECK: define {{.*}}l20{{.*}} #[[ATTR2:[0-9]+]]
|
||||
#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42))))
|
||||
for (int i = 0; i < N; ++i)
|
||||
;
|
||||
// CHECK-NOT: l27, !"maxntidx", i32 42}
|
||||
// CHECK: l27, !"maxntidx", i32 22}
|
||||
// CHECK-NOT: l27, !"maxntidx", i32 42}
|
||||
|
||||
// CHECK: define {{.*}}l25{{.*}} #[[ATTR3:[0-9]+]]
|
||||
#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22)
|
||||
for (int i = 0; i < N; ++i)
|
||||
;
|
||||
@ -31,3 +29,7 @@ void foo(int N) {
|
||||
|
||||
#endif
|
||||
|
||||
// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="128" {{.*}}}
|
||||
// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="4" {{.*}}}
|
||||
// CHECK: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="42" {{.*}}}
|
||||
// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxntid"="22" {{.*}}}
|
||||
|
||||
@ -74,6 +74,23 @@ Function Attributes
|
||||
This attribute indicates the maximum number of registers to be used for the
|
||||
kernel function.
|
||||
|
||||
``"nvvm.maxntid"="<x>[,<y>[,<z>]]"``
|
||||
This attribute declares the maximum number of threads in the thread block
|
||||
(CTA). The maximum number of threads is the product of the maximum extent in
|
||||
each dimension. Exceeding the maximum number of threads results in a runtime
|
||||
error or kernel launch failure.
|
||||
|
||||
``"nvvm.reqntid"="<x>[,<y>[,<z>]]"``
|
||||
This attribute declares the exact number of threads in the thread block
|
||||
(CTA). The number of threads is the product of the value in each dimension.
|
||||
Specifying a different CTA dimension at launch will result in a runtime
|
||||
error or kernel launch failure.
|
||||
|
||||
``"nvvm.cluster_dim"="<x>[,<y>[,<z>]]"``
|
||||
This attribute declares the number of thread blocks (CTAs) in the cluster.
|
||||
The total number of CTAs is the product of the number of CTAs in each
|
||||
dimension. Specifying a different cluster dimension at launch will result in
|
||||
a runtime error or kernel launch failure. Only supported for Hopper+.
|
||||
|
||||
.. _address_spaces:
|
||||
|
||||
|
||||
@ -6406,45 +6406,13 @@ void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
|
||||
KernelEnvironmentGV->setInitializer(NewInitializer);
|
||||
}
|
||||
|
||||
static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
|
||||
Module &M = *Kernel.getParent();
|
||||
NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
|
||||
for (auto *Op : MD->operands()) {
|
||||
if (Op->getNumOperands() != 3)
|
||||
continue;
|
||||
auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
|
||||
if (!KernelOp || KernelOp->getValue() != &Kernel)
|
||||
continue;
|
||||
auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
|
||||
if (!Prop || Prop->getString() != Name)
|
||||
continue;
|
||||
return Op;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
|
||||
bool Min) {
|
||||
// Update the "maxntidx" metadata for NVIDIA, or add it.
|
||||
MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
|
||||
if (ExistingOp) {
|
||||
auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
|
||||
int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
|
||||
ExistingOp->replaceOperandWith(
|
||||
2, ConstantAsMetadata::get(ConstantInt::get(
|
||||
OldVal->getValue()->getType(),
|
||||
Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
|
||||
} else {
|
||||
LLVMContext &Ctx = Kernel.getContext();
|
||||
Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
|
||||
MDString::get(Ctx, Name),
|
||||
ConstantAsMetadata::get(
|
||||
ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
|
||||
// Append metadata to nvvm.annotations
|
||||
Module &M = *Kernel.getParent();
|
||||
NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
|
||||
MD->addOperand(MDNode::get(Ctx, MDVals));
|
||||
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
|
||||
bool Min) {
|
||||
if (Kernel.hasFnAttribute(Name)) {
|
||||
int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
|
||||
Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
|
||||
}
|
||||
Kernel.addFnAttr(Name, llvm::utostr(Value));
|
||||
}
|
||||
|
||||
std::pair<int32_t, int32_t>
|
||||
@ -6466,9 +6434,8 @@ OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
|
||||
return {LB, UB};
|
||||
}
|
||||
|
||||
if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
|
||||
auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
|
||||
int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
|
||||
if (Kernel.hasFnAttribute("nvvm.maxntid")) {
|
||||
int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
|
||||
return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
|
||||
}
|
||||
return {0, ThreadLimit};
|
||||
@ -6485,7 +6452,7 @@ void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
|
||||
return;
|
||||
}
|
||||
|
||||
updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
|
||||
updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
|
||||
}
|
||||
|
||||
std::pair<int32_t, int32_t>
|
||||
|
||||
@ -13,11 +13,13 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/IR/AutoUpgrade.h"
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/ADT/StringExtras.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/ADT/StringSwitch.h"
|
||||
#include "llvm/BinaryFormat/Dwarf.h"
|
||||
#include "llvm/IR/AttributeMask.h"
|
||||
#include "llvm/IR/Attributes.h"
|
||||
#include "llvm/IR/CallingConv.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/DebugInfo.h"
|
||||
@ -46,6 +48,7 @@
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/Regex.h"
|
||||
#include "llvm/TargetParser/Triple.h"
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <numeric>
|
||||
|
||||
@ -5021,6 +5024,43 @@ bool llvm::UpgradeDebugInfo(Module &M) {
|
||||
return Modified;
|
||||
}
|
||||
|
||||
static void upgradeNVVMFnVectorAttr(const StringRef Attr, const char DimC,
|
||||
GlobalValue *GV, const Metadata *V) {
|
||||
Function *F = cast<Function>(GV);
|
||||
|
||||
constexpr StringLiteral DefaultValue = "1";
|
||||
StringRef Vect3[3] = {DefaultValue, DefaultValue, DefaultValue};
|
||||
unsigned Length = 0;
|
||||
|
||||
if (F->hasFnAttribute(Attr)) {
|
||||
// We expect the existing attribute to have the form "x[,y[,z]]". Here we
|
||||
// parse these elements placing them into Vect3
|
||||
StringRef S = F->getFnAttribute(Attr).getValueAsString();
|
||||
for (; Length < 3 && !S.empty(); Length++) {
|
||||
auto [Part, Rest] = S.split(',');
|
||||
Vect3[Length] = Part.trim();
|
||||
S = Rest;
|
||||
}
|
||||
}
|
||||
|
||||
const unsigned Dim = DimC - 'x';
|
||||
assert(Dim >= 0 && Dim < 3 && "Unexpected dim char");
|
||||
|
||||
const uint64_t VInt = mdconst::extract<ConstantInt>(V)->getZExtValue();
|
||||
|
||||
// local variable required for StringRef in Vect3 to point to.
|
||||
const std::string VStr = llvm::utostr(VInt);
|
||||
Vect3[Dim] = VStr;
|
||||
Length = std::max(Length, Dim + 1);
|
||||
|
||||
const std::string NewAttr = llvm::join(ArrayRef(Vect3, Length), ",");
|
||||
F->addFnAttr(Attr, NewAttr);
|
||||
}
|
||||
|
||||
static inline bool isXYZ(StringRef S) {
|
||||
return S == "x" || S == "y" || S == "z";
|
||||
}
|
||||
|
||||
bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
|
||||
const Metadata *V) {
|
||||
if (K == "kernel") {
|
||||
@ -5059,6 +5099,18 @@ bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
|
||||
cast<Function>(GV)->addFnAttr("nvvm.maxnreg", llvm::utostr(CV));
|
||||
return true;
|
||||
}
|
||||
if (K.consume_front("maxntid") && isXYZ(K)) {
|
||||
upgradeNVVMFnVectorAttr("nvvm.maxntid", K[0], GV, V);
|
||||
return true;
|
||||
}
|
||||
if (K.consume_front("reqntid") && isXYZ(K)) {
|
||||
upgradeNVVMFnVectorAttr("nvvm.reqntid", K[0], GV, V);
|
||||
return true;
|
||||
}
|
||||
if (K.consume_front("cluster_dim_") && isXYZ(K)) {
|
||||
upgradeNVVMFnVectorAttr("nvvm.cluster_dim", K[0], GV, V);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -35,6 +35,7 @@
|
||||
#include "llvm/ADT/StringExtras.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/ADT/Twine.h"
|
||||
#include "llvm/ADT/iterator_range.h"
|
||||
#include "llvm/Analysis/ConstantFolding.h"
|
||||
#include "llvm/CodeGen/Analysis.h"
|
||||
#include "llvm/CodeGen/MachineBasicBlock.h"
|
||||
@ -427,24 +428,15 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
|
||||
// If the NVVM IR has some of reqntid* specified, then output
|
||||
// the reqntid directive, and set the unspecified ones to 1.
|
||||
// If none of Reqntid* is specified, don't output reqntid directive.
|
||||
std::optional<unsigned> Reqntidx = getReqNTIDx(F);
|
||||
std::optional<unsigned> Reqntidy = getReqNTIDy(F);
|
||||
std::optional<unsigned> Reqntidz = getReqNTIDz(F);
|
||||
const auto ReqNTID = getReqNTID(F);
|
||||
if (!ReqNTID.empty())
|
||||
O << formatv(".reqntid {0:$[, ]}\n",
|
||||
make_range(ReqNTID.begin(), ReqNTID.end()));
|
||||
|
||||
if (Reqntidx || Reqntidy || Reqntidz)
|
||||
O << ".reqntid " << Reqntidx.value_or(1) << ", " << Reqntidy.value_or(1)
|
||||
<< ", " << Reqntidz.value_or(1) << "\n";
|
||||
|
||||
// If the NVVM IR has some of maxntid* specified, then output
|
||||
// the maxntid directive, and set the unspecified ones to 1.
|
||||
// If none of maxntid* is specified, don't output maxntid directive.
|
||||
std::optional<unsigned> Maxntidx = getMaxNTIDx(F);
|
||||
std::optional<unsigned> Maxntidy = getMaxNTIDy(F);
|
||||
std::optional<unsigned> Maxntidz = getMaxNTIDz(F);
|
||||
|
||||
if (Maxntidx || Maxntidy || Maxntidz)
|
||||
O << ".maxntid " << Maxntidx.value_or(1) << ", " << Maxntidy.value_or(1)
|
||||
<< ", " << Maxntidz.value_or(1) << "\n";
|
||||
const auto MaxNTID = getMaxNTID(F);
|
||||
if (!MaxNTID.empty())
|
||||
O << formatv(".maxntid {0:$[, ]}\n",
|
||||
make_range(MaxNTID.begin(), MaxNTID.end()));
|
||||
|
||||
if (const auto Mincta = getMinCTASm(F))
|
||||
O << ".minnctapersm " << *Mincta << "\n";
|
||||
@ -458,21 +450,19 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
|
||||
const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
|
||||
|
||||
if (STI->getSmVersion() >= 90) {
|
||||
std::optional<unsigned> ClusterX = getClusterDimx(F);
|
||||
std::optional<unsigned> ClusterY = getClusterDimy(F);
|
||||
std::optional<unsigned> ClusterZ = getClusterDimz(F);
|
||||
const auto ClusterDim = getClusterDim(F);
|
||||
|
||||
if (ClusterX || ClusterY || ClusterZ) {
|
||||
if (!ClusterDim.empty()) {
|
||||
O << ".explicitcluster\n";
|
||||
if (ClusterX.value_or(1) != 0) {
|
||||
assert(ClusterY.value_or(1) && ClusterZ.value_or(1) &&
|
||||
if (ClusterDim[0] != 0) {
|
||||
assert(llvm::all_of(ClusterDim, [](unsigned D) { return D != 0; }) &&
|
||||
"cluster_dim_x != 0 implies cluster_dim_y and cluster_dim_z "
|
||||
"should be non-zero as well");
|
||||
|
||||
O << ".reqnctapercluster " << ClusterX.value_or(1) << ", "
|
||||
<< ClusterY.value_or(1) << ", " << ClusterZ.value_or(1) << "\n";
|
||||
O << formatv(".reqnctapercluster {0:$[, ]}\n",
|
||||
make_range(ClusterDim.begin(), ClusterDim.end()));
|
||||
} else {
|
||||
assert(!ClusterY.value_or(1) && !ClusterZ.value_or(1) &&
|
||||
assert(llvm::all_of(ClusterDim, [](unsigned D) { return D == 0; }) &&
|
||||
"cluster_dim_x == 0 implies cluster_dim_y and cluster_dim_z "
|
||||
"should be 0 as well");
|
||||
}
|
||||
|
||||
@ -50,33 +50,10 @@ static std::string getHash(StringRef Str) {
|
||||
return llvm::utohexstr(Hash.low(), /*LowerCase=*/true);
|
||||
}
|
||||
|
||||
static void addKernelMetadata(Module &M, Function *F) {
|
||||
llvm::LLVMContext &Ctx = M.getContext();
|
||||
|
||||
// Get "nvvm.annotations" metadata node.
|
||||
llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
|
||||
|
||||
// This kernel is only to be called single-threaded.
|
||||
llvm::Metadata *ThreadXMDVals[] = {
|
||||
llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidx"),
|
||||
llvm::ConstantAsMetadata::get(
|
||||
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
|
||||
llvm::Metadata *ThreadYMDVals[] = {
|
||||
llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidy"),
|
||||
llvm::ConstantAsMetadata::get(
|
||||
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
|
||||
llvm::Metadata *ThreadZMDVals[] = {
|
||||
llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidz"),
|
||||
llvm::ConstantAsMetadata::get(
|
||||
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
|
||||
|
||||
static void addKernelAttrs(Function *F) {
|
||||
F->addFnAttr("nvvm.maxclusterrank", "1");
|
||||
F->addFnAttr("nvvm.maxntid", "1");
|
||||
F->setCallingConv(CallingConv::PTX_Kernel);
|
||||
|
||||
// Append metadata to nvvm.annotations.
|
||||
MD->addOperand(llvm::MDNode::get(Ctx, ThreadXMDVals));
|
||||
MD->addOperand(llvm::MDNode::get(Ctx, ThreadYMDVals));
|
||||
MD->addOperand(llvm::MDNode::get(Ctx, ThreadZMDVals));
|
||||
}
|
||||
|
||||
static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
|
||||
@ -88,7 +65,7 @@ static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
|
||||
Function *InitOrFiniKernel = Function::createWithDefaultAttr(
|
||||
FunctionType::get(Type::getVoidTy(M.getContext()), false),
|
||||
GlobalValue::WeakODRLinkage, 0, InitOrFiniKernelName, &M);
|
||||
addKernelMetadata(M, InitOrFiniKernel);
|
||||
addKernelAttrs(InitOrFiniKernel);
|
||||
|
||||
return InitOrFiniKernel;
|
||||
}
|
||||
|
||||
@ -575,13 +575,14 @@ unsigned NVPTXTTIImpl::getAssumedAddrSpace(const Value *V) const {
|
||||
void NVPTXTTIImpl::collectKernelLaunchBounds(
|
||||
const Function &F,
|
||||
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
|
||||
std::optional<unsigned> Val;
|
||||
if ((Val = getMaxClusterRank(F)))
|
||||
if (const auto Val = getMaxClusterRank(F))
|
||||
LB.push_back({"maxclusterrank", *Val});
|
||||
if ((Val = getMaxNTIDx(F)))
|
||||
LB.push_back({"maxntidx", *Val});
|
||||
if ((Val = getMaxNTIDy(F)))
|
||||
LB.push_back({"maxntidy", *Val});
|
||||
if ((Val = getMaxNTIDz(F)))
|
||||
LB.push_back({"maxntidz", *Val});
|
||||
|
||||
const auto MaxNTID = getMaxNTID(F);
|
||||
if (MaxNTID.size() > 0)
|
||||
LB.push_back({"maxntidx", MaxNTID[0]});
|
||||
if (MaxNTID.size() > 1)
|
||||
LB.push_back({"maxntidy", MaxNTID[1]});
|
||||
if (MaxNTID.size() > 2)
|
||||
LB.push_back({"maxntidz", MaxNTID[2]});
|
||||
}
|
||||
|
||||
@ -13,6 +13,8 @@
|
||||
#include "NVPTXUtilities.h"
|
||||
#include "NVPTX.h"
|
||||
#include "NVPTXTargetMachine.h"
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
@ -20,6 +22,7 @@
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Support/Alignment.h"
|
||||
#include "llvm/Support/Mutex.h"
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
@ -196,6 +199,35 @@ static std::optional<unsigned> getFnAttrParsedInt(const Function &F,
|
||||
: std::nullopt;
|
||||
}
|
||||
|
||||
static SmallVector<unsigned, 3> getFnAttrParsedVector(const Function &F,
|
||||
StringRef Attr) {
|
||||
SmallVector<unsigned, 3> V;
|
||||
auto &Ctx = F.getContext();
|
||||
|
||||
if (F.hasFnAttribute(Attr)) {
|
||||
// We expect the attribute value to be of the form "x[,y[,z]]", where x, y,
|
||||
// and z are unsigned values.
|
||||
StringRef S = F.getFnAttribute(Attr).getValueAsString();
|
||||
for (unsigned I = 0; I < 3 && !S.empty(); I++) {
|
||||
auto [First, Rest] = S.split(",");
|
||||
unsigned IntVal;
|
||||
if (First.trim().getAsInteger(0, IntVal))
|
||||
Ctx.emitError("can't parse integer attribute " + First + " in " + Attr);
|
||||
|
||||
V.push_back(IntVal);
|
||||
S = Rest;
|
||||
}
|
||||
}
|
||||
return V;
|
||||
}
|
||||
|
||||
static std::optional<uint64_t> getVectorProduct(ArrayRef<unsigned> V) {
|
||||
if (V.empty())
|
||||
return std::nullopt;
|
||||
|
||||
return std::accumulate(V.begin(), V.end(), 1, std::multiplies<uint64_t>{});
|
||||
}
|
||||
|
||||
bool isParamGridConstant(const Value &V) {
|
||||
if (const Argument *Arg = dyn_cast<Argument>(&V)) {
|
||||
// "grid_constant" counts argument indices starting from 1
|
||||
@ -254,71 +286,39 @@ StringRef getSamplerName(const Value &V) {
|
||||
return V.getName();
|
||||
}
|
||||
|
||||
std::optional<unsigned> getMaxNTIDx(const Function &F) {
|
||||
return findOneNVVMAnnotation(&F, "maxntidx");
|
||||
SmallVector<unsigned, 3> getMaxNTID(const Function &F) {
|
||||
return getFnAttrParsedVector(F, "nvvm.maxntid");
|
||||
}
|
||||
|
||||
std::optional<unsigned> getMaxNTIDy(const Function &F) {
|
||||
return findOneNVVMAnnotation(&F, "maxntidy");
|
||||
SmallVector<unsigned, 3> getReqNTID(const Function &F) {
|
||||
return getFnAttrParsedVector(F, "nvvm.reqntid");
|
||||
}
|
||||
|
||||
std::optional<unsigned> getMaxNTIDz(const Function &F) {
|
||||
return findOneNVVMAnnotation(&F, "maxntidz");
|
||||
SmallVector<unsigned, 3> getClusterDim(const Function &F) {
|
||||
return getFnAttrParsedVector(F, "nvvm.cluster_dim");
|
||||
}
|
||||
|
||||
std::optional<unsigned> getMaxNTID(const Function &F) {
|
||||
std::optional<uint64_t> getOverallMaxNTID(const Function &F) {
|
||||
// Note: The semantics here are a bit strange. The PTX ISA states the
|
||||
// following (11.4.2. Performance-Tuning Directives: .maxntid):
|
||||
//
|
||||
// Note that this directive guarantees that the total number of threads does
|
||||
// not exceed the maximum, but does not guarantee that the limit in any
|
||||
// particular dimension is not exceeded.
|
||||
std::optional<unsigned> MaxNTIDx = getMaxNTIDx(F);
|
||||
std::optional<unsigned> MaxNTIDy = getMaxNTIDy(F);
|
||||
std::optional<unsigned> MaxNTIDz = getMaxNTIDz(F);
|
||||
if (MaxNTIDx || MaxNTIDy || MaxNTIDz)
|
||||
return MaxNTIDx.value_or(1) * MaxNTIDy.value_or(1) * MaxNTIDz.value_or(1);
|
||||
return std::nullopt;
|
||||
const auto MaxNTID = getMaxNTID(F);
|
||||
return getVectorProduct(MaxNTID);
|
||||
}
|
||||
|
||||
std::optional<unsigned> getClusterDimx(const Function &F) {
|
||||
return findOneNVVMAnnotation(&F, "cluster_dim_x");
|
||||
}
|
||||
|
||||
std::optional<unsigned> getClusterDimy(const Function &F) {
|
||||
return findOneNVVMAnnotation(&F, "cluster_dim_y");
|
||||
}
|
||||
|
||||
std::optional<unsigned> getClusterDimz(const Function &F) {
|
||||
return findOneNVVMAnnotation(&F, "cluster_dim_z");
|
||||
std::optional<uint64_t> getOverallReqNTID(const Function &F) {
|
||||
// Note: The semantics here are a bit strange. See getMaxNTID.
|
||||
const auto ReqNTID = getReqNTID(F);
|
||||
return getVectorProduct(ReqNTID);
|
||||
}
|
||||
|
||||
std::optional<unsigned> getMaxClusterRank(const Function &F) {
|
||||
return getFnAttrParsedInt(F, "nvvm.maxclusterrank");
|
||||
}
|
||||
|
||||
std::optional<unsigned> getReqNTIDx(const Function &F) {
|
||||
return findOneNVVMAnnotation(&F, "reqntidx");
|
||||
}
|
||||
|
||||
std::optional<unsigned> getReqNTIDy(const Function &F) {
|
||||
return findOneNVVMAnnotation(&F, "reqntidy");
|
||||
}
|
||||
|
||||
std::optional<unsigned> getReqNTIDz(const Function &F) {
|
||||
return findOneNVVMAnnotation(&F, "reqntidz");
|
||||
}
|
||||
|
||||
std::optional<unsigned> getReqNTID(const Function &F) {
|
||||
// Note: The semantics here are a bit strange. See getMaxNTID.
|
||||
std::optional<unsigned> ReqNTIDx = getReqNTIDx(F);
|
||||
std::optional<unsigned> ReqNTIDy = getReqNTIDy(F);
|
||||
std::optional<unsigned> ReqNTIDz = getReqNTIDz(F);
|
||||
if (ReqNTIDx || ReqNTIDy || ReqNTIDz)
|
||||
return ReqNTIDx.value_or(1) * ReqNTIDy.value_or(1) * ReqNTIDz.value_or(1);
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::optional<unsigned> getMinCTASm(const Function &F) {
|
||||
return getFnAttrParsedInt(F, "nvvm.minctasm");
|
||||
}
|
||||
|
||||
@ -14,6 +14,7 @@
|
||||
#define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
|
||||
|
||||
#include "NVPTX.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/ADT/StringExtras.h"
|
||||
#include "llvm/CodeGen/ValueTypes.h"
|
||||
#include "llvm/IR/CallingConv.h"
|
||||
@ -47,19 +48,12 @@ StringRef getTextureName(const Value &);
|
||||
StringRef getSurfaceName(const Value &);
|
||||
StringRef getSamplerName(const Value &);
|
||||
|
||||
std::optional<unsigned> getMaxNTIDx(const Function &);
|
||||
std::optional<unsigned> getMaxNTIDy(const Function &);
|
||||
std::optional<unsigned> getMaxNTIDz(const Function &);
|
||||
std::optional<unsigned> getMaxNTID(const Function &);
|
||||
SmallVector<unsigned, 3> getMaxNTID(const Function &);
|
||||
SmallVector<unsigned, 3> getReqNTID(const Function &);
|
||||
SmallVector<unsigned, 3> getClusterDim(const Function &);
|
||||
|
||||
std::optional<unsigned> getReqNTIDx(const Function &);
|
||||
std::optional<unsigned> getReqNTIDy(const Function &);
|
||||
std::optional<unsigned> getReqNTIDz(const Function &);
|
||||
std::optional<unsigned> getReqNTID(const Function &);
|
||||
|
||||
std::optional<unsigned> getClusterDimx(const Function &);
|
||||
std::optional<unsigned> getClusterDimy(const Function &);
|
||||
std::optional<unsigned> getClusterDimz(const Function &);
|
||||
std::optional<uint64_t> getOverallMaxNTID(const Function &);
|
||||
std::optional<uint64_t> getOverallReqNTID(const Function &);
|
||||
|
||||
std::optional<unsigned> getMaxClusterRank(const Function &);
|
||||
std::optional<unsigned> getMinCTASm(const Function &);
|
||||
|
||||
@ -67,8 +67,8 @@ static bool runNVVMIntrRange(Function &F) {
|
||||
unsigned x, y, z;
|
||||
} MaxBlockSize, MaxGridSize;
|
||||
|
||||
const unsigned MetadataNTID = getReqNTID(F).value_or(
|
||||
getMaxNTID(F).value_or(std::numeric_limits<unsigned>::max()));
|
||||
const unsigned MetadataNTID = getOverallReqNTID(F).value_or(
|
||||
getOverallMaxNTID(F).value_or(std::numeric_limits<unsigned>::max()));
|
||||
|
||||
MaxBlockSize.x = std::min(1024u, MetadataNTID);
|
||||
MaxBlockSize.y = std::min(1024u, MetadataNTID);
|
||||
|
||||
@ -24,11 +24,11 @@ attributes #0 = {
|
||||
"omp_target_num_teams"="100"
|
||||
"omp_target_thread_limit"="101"
|
||||
"nvvm.maxclusterrank"="200"
|
||||
"nvvm.maxntid"="210,211,212"
|
||||
}
|
||||
|
||||
!llvm.module.flags = !{!0}
|
||||
!llvm.dbg.cu = !{!1}
|
||||
!nvvm.annotations = !{!7, !8, !9, !10}
|
||||
|
||||
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
||||
!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
|
||||
@ -36,7 +36,3 @@ attributes #0 = {
|
||||
!3 = !{}
|
||||
!4 = !DISubroutineType(types: !3)
|
||||
!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3)
|
||||
!7 = !{ptr @test, !"maxntidx", i32 210}
|
||||
!8 = !{ptr @test, !"maxntidy", i32 211}
|
||||
!9 = !{ptr @test, !"maxntidz", i32 212}
|
||||
!10 = distinct !{ptr null, !"kernel", i32 1}
|
||||
|
||||
@ -9,14 +9,14 @@
|
||||
; CHECK: .global .surfref surface
|
||||
|
||||
; CHECK: .entry kernel_func_maxntid
|
||||
define void @kernel_func_maxntid(ptr %a) {
|
||||
define ptx_kernel void @kernel_func_maxntid(ptr %a) "nvvm.maxntid"="10,20,30" {
|
||||
; CHECK: .maxntid 10, 20, 30
|
||||
; CHECK: ret
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: .entry kernel_func_reqntid
|
||||
define void @kernel_func_reqntid(ptr %a) {
|
||||
define ptx_kernel void @kernel_func_reqntid(ptr %a) "nvvm.reqntid"="11,22,33" {
|
||||
; CHECK: .reqntid 11, 22, 33
|
||||
; CHECK: ret
|
||||
ret void
|
||||
@ -36,13 +36,7 @@ define ptx_kernel void @kernel_func_maxnreg() "nvvm.maxnreg"="1234" {
|
||||
ret void
|
||||
}
|
||||
|
||||
!nvvm.annotations = !{!1, !2, !3, !4, !9, !10}
|
||||
|
||||
!1 = !{ptr @kernel_func_maxntid, !"kernel", i32 1}
|
||||
!2 = !{ptr @kernel_func_maxntid, !"maxntidx", i32 10, !"maxntidy", i32 20, !"maxntidz", i32 30}
|
||||
|
||||
!3 = !{ptr @kernel_func_reqntid, !"kernel", i32 1}
|
||||
!4 = !{ptr @kernel_func_reqntid, !"reqntidx", i32 11, !"reqntidy", i32 22, !"reqntidz", i32 33}
|
||||
!nvvm.annotations = !{!9, !10}
|
||||
|
||||
!9 = !{ptr addrspace(1) @texture, !"texture", i32 1}
|
||||
!10 = !{ptr addrspace(1) @surface, !"surface", i32 1}
|
||||
|
||||
@ -26,8 +26,4 @@ bb:
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { norecurse nounwind "polly.skip.fn" }
|
||||
|
||||
!nvvm.annotations = !{!0}
|
||||
|
||||
!0 = !{ptr @spam, !"maxntidx", i64 1, !"maxntidy", i64 1, !"maxntidz", i64 1}
|
||||
attributes #0 = { norecurse nounwind "polly.skip.fn" "nvvm.maxntid"="1,1,1" }
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 | FileCheck -check-prefixes=CHECK90 %s
|
||||
; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %}
|
||||
|
||||
define ptx_kernel void @kernel_func_clusterxyz() {
|
||||
define ptx_kernel void @kernel_func_clusterxyz() "nvvm.cluster_dim"="3,5,7" {
|
||||
; CHECK80-LABEL: kernel_func_clusterxyz(
|
||||
; CHECK80: {
|
||||
; CHECK80-EMPTY:
|
||||
@ -21,8 +21,3 @@ define ptx_kernel void @kernel_func_clusterxyz() {
|
||||
; CHECK90-NEXT: ret;
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
!nvvm.annotations = !{!1}
|
||||
|
||||
!1 = !{ptr @kernel_func_clusterxyz, !"cluster_dim_x", i32 3, !"cluster_dim_y", i32 5, !"cluster_dim_z", i32 7}
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5
|
||||
; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -mcpu=sm_20 -passes=nvvm-intr-range | FileCheck %s
|
||||
|
||||
define ptx_kernel i32 @test_maxntid() {
|
||||
define ptx_kernel i32 @test_maxntid() "nvvm.maxntid"="32,1,3" {
|
||||
; CHECK-LABEL: define ptx_kernel i32 @test_maxntid(
|
||||
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
@ -31,9 +31,9 @@ define ptx_kernel i32 @test_maxntid() {
|
||||
ret i32 %11
|
||||
}
|
||||
|
||||
define ptx_kernel i32 @test_reqntid() {
|
||||
define ptx_kernel i32 @test_reqntid() "nvvm.reqntid"="20" {
|
||||
; CHECK-LABEL: define ptx_kernel i32 @test_reqntid(
|
||||
; CHECK-SAME: ) #[[ATTR0]] {
|
||||
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
|
||||
@ -64,9 +64,9 @@ define ptx_kernel i32 @test_reqntid() {
|
||||
;; A case like this could occur if a function with the sreg intrinsic was
|
||||
;; inlined into a kernel where the tid metadata is present, ensure the range is
|
||||
;; updated.
|
||||
define ptx_kernel i32 @test_inlined() {
|
||||
define ptx_kernel i32 @test_inlined() "nvvm.maxntid"="4" {
|
||||
; CHECK-LABEL: define ptx_kernel i32 @test_inlined(
|
||||
; CHECK-SAME: ) #[[ATTR0]] {
|
||||
; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 4) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
; CHECK-NEXT: ret i32 [[TMP1]]
|
||||
;
|
||||
@ -81,8 +81,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
|
||||
|
||||
!nvvm.annotations = !{!0, !1, !2}
|
||||
!0 = !{ptr @test_maxntid, !"maxntidx", i32 32, !"maxntidz", i32 3}
|
||||
!1 = !{ptr @test_reqntid, !"reqntidx", i32 20}
|
||||
!2 = !{ptr @test_inlined, !"maxntidx", i32 4}
|
||||
|
||||
@ -84,11 +84,4 @@ define internal void @bar() {
|
||||
; CHECK: while.end:
|
||||
; CHECK-NEXT: ret void
|
||||
|
||||
; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="1" }
|
||||
|
||||
; CHECK: [[META1:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidx", i32 1}
|
||||
; CHECK: [[META2:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidy", i32 1}
|
||||
; CHECK: [[META3:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidz", i32 1}
|
||||
; CHECK: [[META4:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidx", i32 1}
|
||||
; CHECK: [[META5:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidy", i32 1}
|
||||
; CHECK: [[META6:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidz", i32 1}
|
||||
; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="1" "nvvm.maxntid"="1" }
|
||||
|
||||
@ -4,20 +4,16 @@
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
|
||||
target triple = "nvptx64-unknown-unknown"
|
||||
|
||||
; CHECK: .maxntid 128, 1, 1
|
||||
; CHECK: .maxntid 128
|
||||
; CHECK: .minnctapersm 2
|
||||
; CHECK_SM_90: .maxclusterrank 8
|
||||
; CHECK_SM_80-NOT: .maxclusterrank 8
|
||||
|
||||
; Make sure that for SM version prior to 90 `.maxclusterrank` directive is
|
||||
; silently ignored.
|
||||
define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() "nvvm.minctasm"="2" "nvvm.maxclusterrank"="8" {
|
||||
define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() "nvvm.minctasm"="2" "nvvm.maxclusterrank"="8" "nvvm.maxntid"="128" {
|
||||
entry:
|
||||
%a = alloca i32, align 4
|
||||
store volatile i32 1, ptr %a, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
!nvvm.annotations = !{!1}
|
||||
|
||||
!1 = !{ptr @_Z18TestMaxClusterRankv, !"maxntidx", i32 128}
|
||||
|
||||
@ -48,7 +48,55 @@ define void @test_maxnreg() {
|
||||
ret void
|
||||
}
|
||||
|
||||
!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6}
|
||||
define void @test_maxntid_1() {
|
||||
; CHECK-LABEL: define void @test_maxntid_1(
|
||||
; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_maxntid_2() {
|
||||
; CHECK-LABEL: define void @test_maxntid_2(
|
||||
; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_maxntid_3() {
|
||||
; CHECK-LABEL: define void @test_maxntid_3(
|
||||
; CHECK-SAME: ) #[[ATTR6:[0-9]+]] {
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_maxntid_4() {
|
||||
; CHECK-LABEL: define void @test_maxntid_4(
|
||||
; CHECK-SAME: ) #[[ATTR7:[0-9]+]] {
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_reqntid() {
|
||||
; CHECK-LABEL: define void @test_reqntid(
|
||||
; CHECK-SAME: ) #[[ATTR8:[0-9]+]] {
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_cluster_dim() {
|
||||
; CHECK-LABEL: define void @test_cluster_dim(
|
||||
; CHECK-SAME: ) #[[ATTR9:[0-9]+]] {
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
ret void
|
||||
}
|
||||
|
||||
!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12}
|
||||
|
||||
!0 = !{ptr @test_align, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020010}
|
||||
!1 = !{null, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020008}
|
||||
@ -57,12 +105,24 @@ define void @test_maxnreg() {
|
||||
!4 = !{ptr @test_cluster_max_blocks, !"cluster_max_blocks", i32 3}
|
||||
!5 = !{ptr @test_minctasm, !"minctasm", i32 4}
|
||||
!6 = !{ptr @test_maxnreg, !"maxnreg", i32 5}
|
||||
!7 = !{ptr @test_maxntid_1, !"maxntidx", i32 50}
|
||||
!8 = !{ptr @test_maxntid_2, !"maxntidx", i32 11, !"maxntidy", i32 22, !"maxntidz", i32 33}
|
||||
!9 = !{ptr @test_maxntid_3, !"maxntidz", i32 11, !"maxntidy", i32 22, !"maxntidx", i32 33}
|
||||
!10 = !{ptr @test_maxntid_4, !"maxntidz", i32 100}
|
||||
!11 = !{ptr @test_reqntid, !"reqntidx", i32 31, !"reqntidy", i32 32, !"reqntidz", i32 33}
|
||||
!12 = !{ptr @test_cluster_dim, !"cluster_dim_x", i32 101, !"cluster_dim_y", i32 102, !"cluster_dim_z", i32 103}
|
||||
|
||||
;.
|
||||
; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="2" }
|
||||
; CHECK: attributes #[[ATTR1]] = { "nvvm.maxclusterrank"="3" }
|
||||
; CHECK: attributes #[[ATTR2]] = { "nvvm.minctasm"="4" }
|
||||
; CHECK: attributes #[[ATTR3]] = { "nvvm.maxnreg"="5" }
|
||||
; CHECK: attributes #[[ATTR4]] = { "nvvm.maxntid"="50" }
|
||||
; CHECK: attributes #[[ATTR5]] = { "nvvm.maxntid"="11,22,33" }
|
||||
; CHECK: attributes #[[ATTR6]] = { "nvvm.maxntid"="33,22,11" }
|
||||
; CHECK: attributes #[[ATTR7]] = { "nvvm.maxntid"="1,1,100" }
|
||||
; CHECK: attributes #[[ATTR8]] = { "nvvm.reqntid"="31,32,33" }
|
||||
; CHECK: attributes #[[ATTR9]] = { "nvvm.cluster_dim"="101,102,103" }
|
||||
;.
|
||||
; CHECK: [[META0:![0-9]+]] = !{ptr @test_align, !"align", i32 8}
|
||||
;.
|
||||
|
||||
@ -18,8 +18,10 @@
|
||||
#include "mlir/Target/LLVMIR/ModuleTranslation.h"
|
||||
|
||||
#include "llvm/ADT/StringExtras.h"
|
||||
#include "llvm/ADT/iterator_range.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/IntrinsicsNVPTX.h"
|
||||
#include "llvm/Support/FormatVariadic.h"
|
||||
|
||||
using namespace mlir;
|
||||
using namespace mlir::LLVM;
|
||||
@ -195,48 +197,33 @@ public:
|
||||
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
|
||||
if (!func)
|
||||
return failure();
|
||||
llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
|
||||
llvm::Function *llvmFunc = moduleTranslation.lookupFunction(func.getName());
|
||||
|
||||
auto generateMetadata = [&](int dim, StringRef name) {
|
||||
llvm::Metadata *llvmMetadata[] = {
|
||||
llvm::ValueAsMetadata::get(llvmFunc),
|
||||
llvm::MDString::get(llvmContext, name),
|
||||
llvm::ValueAsMetadata::get(llvm::ConstantInt::get(
|
||||
llvm::Type::getInt32Ty(llvmContext), dim))};
|
||||
llvm::MDNode *llvmMetadataNode =
|
||||
llvm::MDNode::get(llvmContext, llvmMetadata);
|
||||
moduleTranslation.getOrInsertNamedModuleMetadata("nvvm.annotations")
|
||||
->addOperand(llvmMetadataNode);
|
||||
};
|
||||
if (attribute.getName() == NVVM::NVVMDialect::getMaxntidAttrName()) {
|
||||
if (!dyn_cast<DenseI32ArrayAttr>(attribute.getValue()))
|
||||
return failure();
|
||||
auto values = cast<DenseI32ArrayAttr>(attribute.getValue());
|
||||
generateMetadata(values[0], NVVM::NVVMDialect::getMaxntidXName());
|
||||
if (values.size() > 1)
|
||||
generateMetadata(values[1], NVVM::NVVMDialect::getMaxntidYName());
|
||||
if (values.size() > 2)
|
||||
generateMetadata(values[2], NVVM::NVVMDialect::getMaxntidZName());
|
||||
const std::string attr = llvm::formatv(
|
||||
"{0:$[,]}", llvm::make_range(values.asArrayRef().begin(),
|
||||
values.asArrayRef().end()));
|
||||
llvmFunc->addFnAttr("nvvm.maxntid", attr);
|
||||
} else if (attribute.getName() == NVVM::NVVMDialect::getReqntidAttrName()) {
|
||||
if (!dyn_cast<DenseI32ArrayAttr>(attribute.getValue()))
|
||||
return failure();
|
||||
auto values = cast<DenseI32ArrayAttr>(attribute.getValue());
|
||||
generateMetadata(values[0], NVVM::NVVMDialect::getReqntidXName());
|
||||
if (values.size() > 1)
|
||||
generateMetadata(values[1], NVVM::NVVMDialect::getReqntidYName());
|
||||
if (values.size() > 2)
|
||||
generateMetadata(values[2], NVVM::NVVMDialect::getReqntidZName());
|
||||
const std::string attr = llvm::formatv(
|
||||
"{0:$[,]}", llvm::make_range(values.asArrayRef().begin(),
|
||||
values.asArrayRef().end()));
|
||||
llvmFunc->addFnAttr("nvvm.reqntid", attr);
|
||||
} else if (attribute.getName() ==
|
||||
NVVM::NVVMDialect::getClusterDimAttrName()) {
|
||||
if (!dyn_cast<DenseI32ArrayAttr>(attribute.getValue()))
|
||||
return failure();
|
||||
auto values = cast<DenseI32ArrayAttr>(attribute.getValue());
|
||||
generateMetadata(values[0], NVVM::NVVMDialect::getClusterDimXName());
|
||||
if (values.size() > 1)
|
||||
generateMetadata(values[1], NVVM::NVVMDialect::getClusterDimYName());
|
||||
if (values.size() > 2)
|
||||
generateMetadata(values[2], NVVM::NVVMDialect::getClusterDimZName());
|
||||
const std::string attr = llvm::formatv(
|
||||
"{0:$[,]}", llvm::make_range(values.asArrayRef().begin(),
|
||||
values.asArrayRef().end()));
|
||||
llvmFunc->addFnAttr("nvvm.cluster_dim", attr);
|
||||
} else if (attribute.getName() ==
|
||||
NVVM::NVVMDialect::getClusterMaxBlocksAttrName()) {
|
||||
auto value = dyn_cast<IntegerAttr>(attribute.getValue());
|
||||
|
||||
@ -590,33 +590,24 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 2
|
||||
llvm.return
|
||||
}
|
||||
|
||||
// CHECK: define ptx_kernel void @kernel_func
|
||||
// CHECK: !nvvm.annotations =
|
||||
// CHECK: {ptr @kernel_func, !"maxntidx", i32 1}
|
||||
// CHECK: {ptr @kernel_func, !"maxntidy", i32 23}
|
||||
// CHECK: {ptr @kernel_func, !"maxntidz", i32 32}
|
||||
// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]]
|
||||
// CHECK: attributes #[[ATTR0]] = { "nvvm.maxntid"="1,23,32" }
|
||||
// -----
|
||||
|
||||
llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.reqntid = array<i32: 1, 23, 32>} {
|
||||
llvm.return
|
||||
}
|
||||
|
||||
// CHECK: define ptx_kernel void @kernel_func
|
||||
// CHECK: !nvvm.annotations =
|
||||
// CHECK: {ptr @kernel_func, !"reqntidx", i32 1}
|
||||
// CHECK: {ptr @kernel_func, !"reqntidy", i32 23}
|
||||
// CHECK: {ptr @kernel_func, !"reqntidz", i32 32}
|
||||
// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]]
|
||||
// CHECK: attributes #[[ATTR0]] = { "nvvm.reqntid"="1,23,32" }
|
||||
// -----
|
||||
|
||||
llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_dim = array<i32: 3, 5, 7>} {
|
||||
llvm.return
|
||||
}
|
||||
|
||||
// CHECK: define ptx_kernel void @kernel_func
|
||||
// CHECK: !nvvm.annotations =
|
||||
// CHECK: {ptr @kernel_func, !"cluster_dim_x", i32 3}
|
||||
// CHECK: {ptr @kernel_func, !"cluster_dim_y", i32 5}
|
||||
// CHECK: {ptr @kernel_func, !"cluster_dim_z", i32 7}
|
||||
// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]]
|
||||
// CHECK: attributes #[[ATTR0]] = { "nvvm.cluster_dim"="3,5,7" }
|
||||
// -----
|
||||
|
||||
llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_max_blocks = 8} {
|
||||
@ -650,11 +641,7 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 2
|
||||
}
|
||||
|
||||
// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]]
|
||||
// CHECK: attributes #[[ATTR0]] = { "nvvm.maxnreg"="32" "nvvm.minctasm"="16" }
|
||||
// CHECK: !nvvm.annotations =
|
||||
// CHECK: {ptr @kernel_func, !"maxntidx", i32 1}
|
||||
// CHECK: {ptr @kernel_func, !"maxntidy", i32 23}
|
||||
// CHECK: {ptr @kernel_func, !"maxntidz", i32 32}
|
||||
// CHECK: attributes #[[ATTR0]] = { "nvvm.maxnreg"="32" "nvvm.maxntid"="1,23,32" "nvvm.minctasm"="16" }
|
||||
|
||||
// -----
|
||||
// CHECK: define ptx_kernel void @kernel_func
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user