AMDGPU: Annotate group size ABI loads with range metadata (#185420)

We previously did the same for grid size loads when the kernel is annotated
with "amdgpu-max-num-workgroups". The group size is easier to bound, so it's
odd that this wasn't implemented first.
This commit is contained in:
Matt Arsenault 2026-03-09 19:11:59 +01:00 committed by GitHub
parent f1a2fd2abb
commit 76daf31b40
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 193 additions and 57 deletions

View File

@ -82,18 +82,38 @@ Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
} // end anonymous namespace
/// Annotate a grid size load with !range metadata derived from
/// \p MaxNumGroups.
///
/// Nothing is attached when \p MaxNumGroups is 0 or UINT32_MAX, when \p Load
/// does not produce an i32, or when \p Load already carries !range metadata.
/// Otherwise the bounds handed to MDBuilder::createRange are
/// (1, MaxNumGroups + 1).
///
/// \returns true if !range metadata was attached to \p Load.
static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
static bool annotateGridSizeLoadWithRangeMD(LoadInst *Load,
uint32_t MaxNumGroups) {
// NOTE(review): 0 looks like the "no bound known" default supplied by the
// caller; UINT32_MAX is presumably rejected because MaxNumGroups + 1 below
// would wrap around to 0 -- confirm.
if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
return;
return false;
// Only a 32-bit integer load can take the i32 range built below.
if (!Load->getType()->isIntegerTy(32))
return;
return false;
// TODO: If there is existing range metadata, preserve it if it is stricter.
if (Load->hasMetadata(LLVMContext::MD_range))
return false;
MDBuilder MDB(Load->getContext());
// The upper bound is MaxNumGroups + 1 so that MaxNumGroups itself stays
// inside the range (presumably createRange's upper bound is exclusive).
MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
Load->setMetadata(LLVMContext::MD_range, Range);
return true;
}
/// Annotate a 16-bit load of a hidden group size (or group size remainder)
/// implicit argument with !range metadata.
///
/// MDBuilder::createRange describes the half-open interval [Lo, Hi), so the
/// upper bound has to be one past the largest value the load may produce:
///   - group size: [1, MaxFlatWorkGroupSize + 1), i.e. 1..MaxFlatWorkGroupSize
///   - remainder:  [0, MaxFlatWorkGroupSize),     i.e. 0..MaxFlatWorkGroupSize-1
///     (assuming a remainder is always strictly smaller than the group size)
///
/// \param Load        Load of the implicit argument.
/// \param IsRemainder True when \p Load reads a remainder rather than a
///                    group size.
/// \returns true if !range metadata was attached to \p Load. Nothing is
/// attached when the load is not i16 or already has !range metadata.
static bool annotateGroupSizeLoadWithRangeMD(LoadInst *Load, bool IsRemainder) {
  if (!Load->getType()->isIntegerTy(16))
    return false;

  // TODO: If there is existing range metadata, preserve it if it is stricter.
  if (Load->hasMetadata(LLVMContext::MD_range))
    return false;

  const unsigned MaxGroupSize = AMDGPU::IsaInfo::getMaxFlatWorkGroupSize();

  // Inclusive bounds are [1, Max] for a size and [0, Max - 1] for a
  // remainder; add one to the inclusive maximum to form the exclusive Hi.
  const unsigned Lo = IsRemainder ? 0 : 1;
  const unsigned Hi = IsRemainder ? MaxGroupSize : MaxGroupSize + 1;

  MDBuilder MDB(Load->getContext());
  MDNode *Range = MDB.createRange(APInt(16, Lo), APInt(16, Hi));
  Load->setMetadata(LLVMContext::MD_range, Range);
  return true;
}
static bool processUse(CallInst *CI, bool IsV5OrAbove) {
@ -109,18 +129,13 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups",
/*Size=*/3, /*DefaultVal=*/0);
if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
!Intrinsic::getDeclarationIfExists(CI->getModule(),
Intrinsic::amdgcn_dispatch_ptr) &&
none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
return false;
Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
Value *Remainders[3] = {nullptr, nullptr, nullptr};
Value *GridSizes[3] = {nullptr, nullptr, nullptr};
const DataLayout &DL = F->getDataLayout();
bool MadeChange = false;
// We expect to see several GEP users, casted to the appropriate type and
// loaded.
@ -155,44 +170,59 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
case HIDDEN_BLOCK_COUNT_X:
if (LoadSize == 4) {
BlockCounts[0] = Load;
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
MadeChange |=
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
}
break;
case HIDDEN_BLOCK_COUNT_Y:
if (LoadSize == 4) {
BlockCounts[1] = Load;
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
MadeChange |=
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
}
break;
case HIDDEN_BLOCK_COUNT_Z:
if (LoadSize == 4) {
BlockCounts[2] = Load;
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
MadeChange |=
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
}
break;
case HIDDEN_GROUP_SIZE_X:
if (LoadSize == 2)
if (LoadSize == 2) {
GroupSizes[0] = Load;
MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, false);
}
break;
case HIDDEN_GROUP_SIZE_Y:
if (LoadSize == 2)
if (LoadSize == 2) {
GroupSizes[1] = Load;
MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, false);
}
break;
case HIDDEN_GROUP_SIZE_Z:
if (LoadSize == 2)
if (LoadSize == 2) {
GroupSizes[2] = Load;
MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, false);
}
break;
case HIDDEN_REMAINDER_X:
if (LoadSize == 2)
if (LoadSize == 2) {
Remainders[0] = Load;
MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, true);
}
break;
case HIDDEN_REMAINDER_Y:
if (LoadSize == 2)
if (LoadSize == 2) {
Remainders[1] = Load;
MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, true);
}
break;
case HIDDEN_REMAINDER_Z:
if (LoadSize == 2)
if (LoadSize == 2) {
Remainders[2] = Load;
MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, true);
}
break;
default:
break;
@ -229,7 +259,6 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
}
}
bool MadeChange = false;
if (IsV5OrAbove && HasUniformWorkGroupSize) {
// Under v5 __ockl_get_local_size returns the value computed by the
// expression:

View File

@ -907,7 +907,7 @@ public:
/// \returns Maximum flat work group size supported by the subtarget.
/// The value is obtained by forwarding to
/// AMDGPU::IsaInfo::getMaxFlatWorkGroupSize.
unsigned getMaxFlatWorkGroupSize() const override {
return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize();
}
/// \returns Number of waves per execution unit required to support the given

View File

@ -144,7 +144,7 @@ public:
/// \returns Maximum flat work group size supported by the subtarget.
/// The value is obtained by forwarding to
/// AMDGPU::IsaInfo::getMaxFlatWorkGroupSize.
unsigned getMaxFlatWorkGroupSize() const override {
return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize();
}
/// \returns Number of waves per execution unit required to support the given

View File

@ -1265,11 +1265,6 @@ unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI,
/// \returns Minimum flat work group size. The result is the constant 1;
/// \p STI is not consulted.
unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI) { return 1; }
unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI) {
// Some subtargets allow encoding 2048, but this isn't tested or supported.
return 1024;
}
unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI,
unsigned FlatWorkGroupSize) {
return divideCeil(FlatWorkGroupSize, getWavefrontSize(STI));

View File

@ -265,8 +265,11 @@ unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI,
/// \returns Minimum flat work group size for given subtarget \p STI.
unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI);
/// \returns Maximum flat work group size for given subtarget \p STI.
unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI);
/// \returns Maximum flat work group size. This is a compile-time constant
/// and takes no subtarget argument.
constexpr unsigned getMaxFlatWorkGroupSize() {
  // Some subtargets allow encoding 2048, but this isn't tested or supported.
  constexpr unsigned SupportedMax = 1024;
  return SupportedMax;
}
/// \returns Number of waves per work group for given subtarget \p STI and
/// \p FlatWorkGroupSize.

View File

@ -17,7 +17,7 @@ define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range() #0 {
; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0]]
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG1:![0-9]+]]
; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
;
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
@ -30,7 +30,7 @@ define i32 @use_grid_size_y_max_num_workgroups() #0 {
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[GEP_GRID_SIZE_Y:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 4
; CHECK-NEXT: [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Y]], align 4, !range [[RNG1:![0-9]+]]
; CHECK-NEXT: [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Y]], align 4, !range [[RNG2:![0-9]+]]
; CHECK-NEXT: ret i32 [[GRID_SIZE_Y]]
;
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
@ -44,7 +44,7 @@ define i32 @use_grid_size_z_max_num_workgroups() #0 {
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[GEP_GRID_SIZE_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 8
; CHECK-NEXT: [[GRID_SIZE_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Z]], align 4, !range [[RNG2:![0-9]+]]
; CHECK-NEXT: [[GRID_SIZE_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Z]], align 4, !range [[RNG3:![0-9]+]]
; CHECK-NEXT: ret i32 [[GRID_SIZE_Z]]
;
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
@ -69,7 +69,7 @@ define i32 @use_grid_size_x_max_num_workgroups_max_minus_1() #1 {
; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max_minus_1(
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG3:![0-9]+]]
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG4:![0-9]+]]
; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
;
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
@ -118,7 +118,8 @@ attributes #3 = { "amdgpu-max-num-workgroups"="0,42,89" }
; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
; CHECK: [[RNG0]] = !{i32 1, i32 37}
; CHECK: [[RNG1]] = !{i32 1, i32 43}
; CHECK: [[RNG2]] = !{i32 1, i32 90}
; CHECK: [[RNG3]] = !{i32 1, i32 -1}
; CHECK: [[RNG1]] = !{i32 0, i32 -1}
; CHECK: [[RNG2]] = !{i32 1, i32 43}
; CHECK: [[RNG3]] = !{i32 1, i32 90}
; CHECK: [[RNG4]] = !{i32 1, i32 -1}
;.

View File

@ -159,8 +159,8 @@ define i32 @bad_offset() {
; CHECK-NEXT: [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[D_GEP_Y]], align 4
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[I_GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 12
; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2
; CHECK-NEXT: [[CONV_X:%.*]] = zext i16 [[WG_SIZE_X]] to i32
; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2, !range [[RNG1:![0-9]+]]
; CHECK-NEXT: [[CONV_X:%.*]] = zext nneg i16 [[WG_SIZE_X]] to i32
; CHECK-NEXT: [[COUNT_X:%.*]] = udiv i32 [[GRID_SIZE_Y]], [[CONV_X]]
; CHECK-NEXT: ret i32 [[COUNT_X]]
;
@ -203,8 +203,8 @@ define i32 @wrong_cast() {
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[D_GEP_X]], align 4
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[I_GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 12
; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2
; CHECK-NEXT: [[CONV_X:%.*]] = sext i16 [[WG_SIZE_X]] to i32
; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2, !range [[RNG1]]
; CHECK-NEXT: [[CONV_X:%.*]] = zext nneg i16 [[WG_SIZE_X]] to i32
; CHECK-NEXT: [[COUNT_X:%.*]] = udiv i32 [[GRID_SIZE_X]], [[CONV_X]]
; CHECK-NEXT: ret i32 [[COUNT_X]]
;
@ -253,8 +253,8 @@ define i32 @wrong_intrinsic() {
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[D_GEP_X]], align 4
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[I_GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 12
; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2
; CHECK-NEXT: [[CONV_X:%.*]] = zext i16 [[WG_SIZE_X]] to i32
; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2, !range [[RNG1]]
; CHECK-NEXT: [[CONV_X:%.*]] = zext nneg i16 [[WG_SIZE_X]] to i32
; CHECK-NEXT: [[COUNT_X:%.*]] = udiv i32 [[GRID_SIZE_X]], [[CONV_X]]
; CHECK-NEXT: ret i32 [[COUNT_X]]
;
@ -279,7 +279,7 @@ define i16 @empty_use() {
; CHECK-NEXT: [[TRUNC_X:%.*]] = trunc i32 [[GRID_SIZE_X]] to i16
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[I_GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 12
; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2
; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2, !range [[RNG1]]
; CHECK-NEXT: [[COUNT_X:%.*]] = udiv i16 [[TRUNC_X]], [[WG_SIZE_X]]
; CHECK-NEXT: ret i16 [[COUNT_X]]
;
@ -319,4 +319,5 @@ entry:
}
;.
; CHECK: [[META0]] = !{}
; CHECK: [[RNG1]] = !{i16 1, i16 1024}
;.

View File

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals smart
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -check-prefix=GCN %s
; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
define amdgpu_kernel void @get_local_size_x(ptr addrspace(1) %out) #0 {
@ -63,9 +63,53 @@ define amdgpu_kernel void @get_local_size_z(ptr addrspace(1) %out) #0 {
ret void
}
; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
; The i16 load of the remainder X slot (implicitarg offset 18) is expected to
; be kept and to gain !range metadata.
define amdgpu_kernel void @get_remainder_x(ptr addrspace(1) %out) #0 {
define amdgpu_kernel void @get_remainder_x(ptr addrspace(1) %out) #2 {
; GCN-LABEL: @get_remainder_x(
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 18
; GCN-NEXT: [[REMAINDER_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2, !range [[RNG1:![0-9]+]]
; GCN-NEXT: store i16 [[REMAINDER_X]], ptr addrspace(1) [[OUT:%.*]], align 2
; GCN-NEXT: ret void
;
%implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18
%remainder.x = load i16, ptr addrspace(4) %gep.x, align 2
store i16 %remainder.x, ptr addrspace(1) %out
ret void
}
; Kernel without "uniform-work-group-size" (attributes #2): the i16 load of
; the remainder Y slot (implicitarg offset 20) is expected to be kept and to
; gain !range metadata.
define amdgpu_kernel void @get_remainder_y(ptr addrspace(1) %out) #2 {
; GCN-LABEL: @get_remainder_y(
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; GCN-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 20
; GCN-NEXT: [[REMAINDER_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 4, !range [[RNG1]]
; GCN-NEXT: store i16 [[REMAINDER_Y]], ptr addrspace(1) [[OUT:%.*]], align 2
; GCN-NEXT: ret void
;
%implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 20
%remainder.y = load i16, ptr addrspace(4) %gep.y, align 2
store i16 %remainder.y, ptr addrspace(1) %out
ret void
}
; Kernel without "uniform-work-group-size" (attributes #2): the i16 load of
; the remainder Z slot (implicitarg offset 22) is expected to be kept and to
; gain !range metadata.
define amdgpu_kernel void @get_remainder_z(ptr addrspace(1) %out) #2 {
; GCN-LABEL: @get_remainder_z(
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; GCN-NEXT: [[GEP_Z:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 22
; GCN-NEXT: [[REMAINDER_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 2, !range [[RNG1]]
; GCN-NEXT: store i16 [[REMAINDER_Z]], ptr addrspace(1) [[OUT:%.*]], align 2
; GCN-NEXT: ret void
;
%implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 22
%remainder.z = load i16, ptr addrspace(4) %gep.z, align 2
store i16 %remainder.z, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @get_remainder_x_uniform(ptr addrspace(1) %out) #0 {
; GCN-LABEL: @get_remainder_x_uniform(
; GCN-NEXT: store i16 0, ptr addrspace(1) [[OUT:%.*]], align 2
; GCN-NEXT: ret void
;
@ -76,27 +120,25 @@ define amdgpu_kernel void @get_remainder_x(ptr addrspace(1) %out) #0 {
ret void
}
; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
define amdgpu_kernel void @get_remainder_y(ptr addrspace(1) %out) #0 {
; GCN-LABEL: @get_remainder_y(
define amdgpu_kernel void @get_remainder_y_uniform(ptr addrspace(1) %out) #0 {
; GCN-LABEL: @get_remainder_y_uniform(
; GCN-NEXT: store i16 0, ptr addrspace(1) [[OUT:%.*]], align 2
; GCN-NEXT: ret void
;
%implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18
%gep.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 20
%remainder.y = load i16, ptr addrspace(4) %gep.y, align 2
store i16 %remainder.y, ptr addrspace(1) %out
ret void
}
; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
define amdgpu_kernel void @get_remainder_z(ptr addrspace(1) %out) #0 {
; GCN-LABEL: @get_remainder_z(
define amdgpu_kernel void @get_remainder_z_uniform(ptr addrspace(1) %out) #0 {
; GCN-LABEL: @get_remainder_z_uniform(
; GCN-NEXT: store i16 0, ptr addrspace(1) [[OUT:%.*]], align 2
; GCN-NEXT: ret void
;
%implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18
%gep.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 22
%remainder.z = load i16, ptr addrspace(4) %gep.z, align 2
store i16 %remainder.z, ptr addrspace(1) %out
ret void
@ -107,7 +149,7 @@ define amdgpu_kernel void @get_work_group_size_x(ptr addrspace(1) %out) #0 {
; GCN-LABEL: @get_work_group_size_x(
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12
; GCN-NEXT: [[GROUP_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 4
; GCN-NEXT: [[GROUP_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 4, !range [[RNG2:![0-9]+]]
; GCN-NEXT: store i16 [[GROUP_SIZE_X]], ptr addrspace(1) [[OUT:%.*]], align 2
; GCN-NEXT: ret void
;
@ -123,7 +165,7 @@ define amdgpu_kernel void @get_work_group_size_y(ptr addrspace(1) %out) #0 {
; GCN-LABEL: @get_work_group_size_y(
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; GCN-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 14
; GCN-NEXT: [[GROUP_SIZE_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2
; GCN-NEXT: [[GROUP_SIZE_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2, !range [[RNG2]]
; GCN-NEXT: store i16 [[GROUP_SIZE_Y]], ptr addrspace(1) [[OUT:%.*]], align 2
; GCN-NEXT: ret void
;
@ -139,7 +181,7 @@ define amdgpu_kernel void @get_work_group_size_z(ptr addrspace(1) %out) #0 {
; GCN-LABEL: @get_work_group_size_z(
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; GCN-NEXT: [[GEP_Z:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 16
; GCN-NEXT: [[GROUP_SIZE_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 4
; GCN-NEXT: [[GROUP_SIZE_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 4, !range [[RNG2]]
; GCN-NEXT: store i16 [[GROUP_SIZE_Z]], ptr addrspace(1) [[OUT:%.*]], align 2
; GCN-NEXT: ret void
;
@ -189,6 +231,65 @@ define amdgpu_kernel void @get_work_group_size_z_reqd(ptr addrspace(1) %out) #0
ret void
}
; Negative test: the remainder X slot (implicitarg offset 18) is loaded as
; half instead of i16, so the load is expected to stay without !range
; metadata.
define amdgpu_kernel void @get_remainder_x_wrong_type(ptr addrspace(1) %out) #2 {
; GCN-LABEL: @get_remainder_x_wrong_type(
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 18
; GCN-NEXT: [[REMAINDER_X:%.*]] = load half, ptr addrspace(4) [[GEP_X]], align 2
; GCN-NEXT: store half [[REMAINDER_X]], ptr addrspace(1) [[OUT:%.*]], align 2
; GCN-NEXT: ret void
;
%implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18
%remainder.x = load half, ptr addrspace(4) %gep.x, align 2
store half %remainder.x, ptr addrspace(1) %out
ret void
}
; Negative test: the remainder Y slot (implicitarg offset 20) is loaded as
; half instead of i16, so the load is expected to stay without !range
; metadata.
define amdgpu_kernel void @get_remainder_y_wrong_type(ptr addrspace(1) %out) #2 {
; GCN-LABEL: @get_remainder_y_wrong_type(
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; GCN-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 20
; GCN-NEXT: [[REMAINDER_Y:%.*]] = load half, ptr addrspace(4) [[GEP_Y]], align 4
; GCN-NEXT: store half [[REMAINDER_Y]], ptr addrspace(1) [[OUT:%.*]], align 2
; GCN-NEXT: ret void
;
  %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  %gep.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 20
  %remainder.y = load half, ptr addrspace(4) %gep.y, align 2
  store half %remainder.y, ptr addrspace(1) %out
  ret void
}
; Negative test: the remainder Z slot (implicitarg offset 22) is loaded as
; half instead of i16, so the load is expected to stay without !range
; metadata.
define amdgpu_kernel void @get_remainder_z_wrong_type(ptr addrspace(1) %out) #2 {
; GCN-LABEL: @get_remainder_z_wrong_type(
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; GCN-NEXT: [[GEP_Z:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 22
; GCN-NEXT: [[REMAINDER_Z:%.*]] = load half, ptr addrspace(4) [[GEP_Z]], align 2
; GCN-NEXT: store half [[REMAINDER_Z]], ptr addrspace(1) [[OUT:%.*]], align 2
; GCN-NEXT: ret void
;
  %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  %gep.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 22
  %remainder.z = load half, ptr addrspace(4) %gep.z, align 2
  store half %remainder.z, ptr addrspace(1) %out
  ret void
}
; The remainder X load already carries !range !{i16 0, i16 10}; the checks
; expect that existing range to be kept rather than replaced.
define amdgpu_kernel void @get_remainder_x_existing_range(ptr addrspace(1) %out) #2 {
; GCN-LABEL: @get_remainder_x_existing_range(
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 18
; GCN-NEXT: [[REMAINDER_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2, !range [[RNG4:![0-9]+]]
; GCN-NEXT: store i16 [[REMAINDER_X]], ptr addrspace(1) [[OUT:%.*]], align 2
; GCN-NEXT: ret void
;
%implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18
%remainder.x = load i16, ptr addrspace(4) %gep.x, align 2, !range !{i16 0, i16 10}
store i16 %remainder.x, ptr addrspace(1) %out
ret void
}
declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #1
declare i32 @llvm.amdgcn.workgroup.id.x() #1
@ -199,5 +300,11 @@ declare i32 @llvm.amdgcn.workgroup.id.z() #1
attributes #0 = { nounwind "uniform-work-group-size" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nounwind }
!0 = !{i32 8, i32 16, i32 2}
!1 = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
; GCN: [[RNG1]] = !{i16 0, i16 1023}
; GCN: [[RNG2]] = !{i16 1, i16 1024}
; GCN: [[RNG4]] = !{i16 0, i16 10}
;.