AMDGPU: Annotate group size ABI loads with range metadata (#185420)
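We previously did the same for the grid size loads when the function was annotated with "amdgpu-max-num-workgroups". The group size is easier, since its bound does not depend on any annotation, so it's weird that this wasn't implemented first.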
This commit is contained in:
parent
f1a2fd2abb
commit
76daf31b40
@ -82,18 +82,38 @@ Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
|
||||
|
||||
} // end anonymous namespace
|
||||
|
||||
static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
|
||||
static bool annotateGridSizeLoadWithRangeMD(LoadInst *Load,
|
||||
uint32_t MaxNumGroups) {
|
||||
if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
|
||||
return;
|
||||
return false;
|
||||
|
||||
if (!Load->getType()->isIntegerTy(32))
|
||||
return;
|
||||
return false;
|
||||
|
||||
// TODO: If there is existing range metadata, preserve it if it is stricter.
|
||||
if (Load->hasMetadata(LLVMContext::MD_range))
|
||||
return false;
|
||||
|
||||
MDBuilder MDB(Load->getContext());
|
||||
MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
|
||||
Load->setMetadata(LLVMContext::MD_range, Range);
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Attach !range metadata to a 16-bit load of a hidden group size or hidden
/// remainder implicit kernel argument.
///
/// Range metadata describes the half-open interval [Lo, Hi), so the upper
/// bound has to be one past the largest value the load may produce (the grid
/// size annotation above does the same with MaxNumGroups + 1):
///   - group size: [1, MaxFlatWorkGroupSize + 1), the maximum size is valid.
///   - remainder:  [0, MaxFlatWorkGroupSize), assuming a partial group is
///     strictly smaller than a full group and may be absent (zero).
///
/// \param Load        the load of the implicit argument field.
/// \param IsRemainder true for a remainder field, false for a group size.
/// \returns true if range metadata was added to \p Load.
static bool annotateGroupSizeLoadWithRangeMD(LoadInst *Load, bool IsRemainder) {
  // Only the exact i16 field type is annotated; anything else is left alone.
  if (!Load->getType()->isIntegerTy(16))
    return false;

  // TODO: If there is existing range metadata, preserve it if it is stricter.
  if (Load->hasMetadata(LLVMContext::MD_range))
    return false;

  const unsigned MaxGroupSize = AMDGPU::IsaInfo::getMaxFlatWorkGroupSize();

  // Inclusive bounds of the loaded value.
  const unsigned MinVal = IsRemainder ? 0 : 1;
  const unsigned MaxVal = IsRemainder ? MaxGroupSize - 1 : MaxGroupSize;

  MDBuilder MDB(Load->getContext());
  // The upper bound of !range is exclusive, hence MaxVal + 1.
  MDNode *Range =
      MDB.createRange(APInt(16, MinVal), APInt(16, MaxVal + 1));
  Load->setMetadata(LLVMContext::MD_range, Range);
  return true;
}
|
||||
|
||||
static bool processUse(CallInst *CI, bool IsV5OrAbove) {
|
||||
@ -109,18 +129,13 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
|
||||
AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups",
|
||||
/*Size=*/3, /*DefaultVal=*/0);
|
||||
|
||||
if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
|
||||
!Intrinsic::getDeclarationIfExists(CI->getModule(),
|
||||
Intrinsic::amdgcn_dispatch_ptr) &&
|
||||
none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
|
||||
return false;
|
||||
|
||||
Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
|
||||
Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
|
||||
Value *Remainders[3] = {nullptr, nullptr, nullptr};
|
||||
Value *GridSizes[3] = {nullptr, nullptr, nullptr};
|
||||
|
||||
const DataLayout &DL = F->getDataLayout();
|
||||
bool MadeChange = false;
|
||||
|
||||
// We expect to see several GEP users, casted to the appropriate type and
|
||||
// loaded.
|
||||
@ -155,44 +170,59 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
|
||||
case HIDDEN_BLOCK_COUNT_X:
|
||||
if (LoadSize == 4) {
|
||||
BlockCounts[0] = Load;
|
||||
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
|
||||
MadeChange |=
|
||||
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
|
||||
}
|
||||
break;
|
||||
case HIDDEN_BLOCK_COUNT_Y:
|
||||
if (LoadSize == 4) {
|
||||
BlockCounts[1] = Load;
|
||||
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
|
||||
MadeChange |=
|
||||
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
|
||||
}
|
||||
break;
|
||||
case HIDDEN_BLOCK_COUNT_Z:
|
||||
if (LoadSize == 4) {
|
||||
BlockCounts[2] = Load;
|
||||
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
|
||||
MadeChange |=
|
||||
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
|
||||
}
|
||||
break;
|
||||
case HIDDEN_GROUP_SIZE_X:
|
||||
if (LoadSize == 2)
|
||||
if (LoadSize == 2) {
|
||||
GroupSizes[0] = Load;
|
||||
MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, false);
|
||||
}
|
||||
break;
|
||||
case HIDDEN_GROUP_SIZE_Y:
|
||||
if (LoadSize == 2)
|
||||
if (LoadSize == 2) {
|
||||
GroupSizes[1] = Load;
|
||||
MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, false);
|
||||
}
|
||||
break;
|
||||
case HIDDEN_GROUP_SIZE_Z:
|
||||
if (LoadSize == 2)
|
||||
if (LoadSize == 2) {
|
||||
GroupSizes[2] = Load;
|
||||
MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, false);
|
||||
}
|
||||
break;
|
||||
case HIDDEN_REMAINDER_X:
|
||||
if (LoadSize == 2)
|
||||
if (LoadSize == 2) {
|
||||
Remainders[0] = Load;
|
||||
MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, true);
|
||||
}
|
||||
break;
|
||||
case HIDDEN_REMAINDER_Y:
|
||||
if (LoadSize == 2)
|
||||
if (LoadSize == 2) {
|
||||
Remainders[1] = Load;
|
||||
MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, true);
|
||||
}
|
||||
break;
|
||||
case HIDDEN_REMAINDER_Z:
|
||||
if (LoadSize == 2)
|
||||
if (LoadSize == 2) {
|
||||
Remainders[2] = Load;
|
||||
MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, true);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@ -229,7 +259,6 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
|
||||
}
|
||||
}
|
||||
|
||||
bool MadeChange = false;
|
||||
if (IsV5OrAbove && HasUniformWorkGroupSize) {
|
||||
// Under v5 __ockl_get_local_size returns the value computed by the
|
||||
// expression:
|
||||
|
||||
@ -907,7 +907,7 @@ public:
|
||||
|
||||
/// \returns Maximum flat work group size supported by the subtarget.
|
||||
unsigned getMaxFlatWorkGroupSize() const override {
|
||||
return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
|
||||
return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize();
|
||||
}
|
||||
|
||||
/// \returns Number of waves per execution unit required to support the given
|
||||
|
||||
@ -144,7 +144,7 @@ public:
|
||||
|
||||
/// \returns Maximum flat work group size supported by the subtarget.
|
||||
unsigned getMaxFlatWorkGroupSize() const override {
|
||||
return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
|
||||
return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize();
|
||||
}
|
||||
|
||||
/// \returns Number of waves per execution unit required to support the given
|
||||
|
||||
@ -1265,11 +1265,6 @@ unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI,
|
||||
|
||||
unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI) { return 1; }
|
||||
|
||||
unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI) {
|
||||
// Some subtargets allow encoding 2048, but this isn't tested or supported.
|
||||
return 1024;
|
||||
}
|
||||
|
||||
unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI,
|
||||
unsigned FlatWorkGroupSize) {
|
||||
return divideCeil(FlatWorkGroupSize, getWavefrontSize(STI));
|
||||
|
||||
@ -265,8 +265,11 @@ unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI,
|
||||
/// \returns Minimum flat work group size for given subtarget \p STI.
|
||||
unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI);
|
||||
|
||||
/// \returns Maximum flat work group size for given subtarget \p STI.
|
||||
unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI);
|
||||
/// \returns Maximum flat work group size
|
||||
constexpr unsigned getMaxFlatWorkGroupSize() {
|
||||
// Some subtargets allow encoding 2048, but this isn't tested or supported.
|
||||
return 1024;
|
||||
}
|
||||
|
||||
/// \returns Number of waves per work group for given subtarget \p STI and
|
||||
/// \p FlatWorkGroupSize.
|
||||
|
||||
@ -17,7 +17,7 @@ define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range() #0 {
|
||||
; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range(
|
||||
; CHECK-SAME: ) #[[ATTR0]] {
|
||||
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0]]
|
||||
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG1:![0-9]+]]
|
||||
; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
|
||||
;
|
||||
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
@ -30,7 +30,7 @@ define i32 @use_grid_size_y_max_num_workgroups() #0 {
|
||||
; CHECK-SAME: ) #[[ATTR0]] {
|
||||
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; CHECK-NEXT: [[GEP_GRID_SIZE_Y:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 4
|
||||
; CHECK-NEXT: [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Y]], align 4, !range [[RNG1:![0-9]+]]
|
||||
; CHECK-NEXT: [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Y]], align 4, !range [[RNG2:![0-9]+]]
|
||||
; CHECK-NEXT: ret i32 [[GRID_SIZE_Y]]
|
||||
;
|
||||
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
@ -44,7 +44,7 @@ define i32 @use_grid_size_z_max_num_workgroups() #0 {
|
||||
; CHECK-SAME: ) #[[ATTR0]] {
|
||||
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; CHECK-NEXT: [[GEP_GRID_SIZE_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 8
|
||||
; CHECK-NEXT: [[GRID_SIZE_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Z]], align 4, !range [[RNG2:![0-9]+]]
|
||||
; CHECK-NEXT: [[GRID_SIZE_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Z]], align 4, !range [[RNG3:![0-9]+]]
|
||||
; CHECK-NEXT: ret i32 [[GRID_SIZE_Z]]
|
||||
;
|
||||
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
@ -69,7 +69,7 @@ define i32 @use_grid_size_x_max_num_workgroups_max_minus_1() #1 {
|
||||
; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max_minus_1(
|
||||
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
|
||||
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG3:![0-9]+]]
|
||||
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG4:![0-9]+]]
|
||||
; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
|
||||
;
|
||||
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
@ -118,7 +118,8 @@ attributes #3 = { "amdgpu-max-num-workgroups"="0,42,89" }
|
||||
; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
;.
|
||||
; CHECK: [[RNG0]] = !{i32 1, i32 37}
|
||||
; CHECK: [[RNG1]] = !{i32 1, i32 43}
|
||||
; CHECK: [[RNG2]] = !{i32 1, i32 90}
|
||||
; CHECK: [[RNG3]] = !{i32 1, i32 -1}
|
||||
; CHECK: [[RNG1]] = !{i32 0, i32 -1}
|
||||
; CHECK: [[RNG2]] = !{i32 1, i32 43}
|
||||
; CHECK: [[RNG3]] = !{i32 1, i32 90}
|
||||
; CHECK: [[RNG4]] = !{i32 1, i32 -1}
|
||||
;.
|
||||
|
||||
@ -159,8 +159,8 @@ define i32 @bad_offset() {
|
||||
; CHECK-NEXT: [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[D_GEP_Y]], align 4
|
||||
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; CHECK-NEXT: [[I_GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 12
|
||||
; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2
|
||||
; CHECK-NEXT: [[CONV_X:%.*]] = zext i16 [[WG_SIZE_X]] to i32
|
||||
; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2, !range [[RNG1:![0-9]+]]
|
||||
; CHECK-NEXT: [[CONV_X:%.*]] = zext nneg i16 [[WG_SIZE_X]] to i32
|
||||
; CHECK-NEXT: [[COUNT_X:%.*]] = udiv i32 [[GRID_SIZE_Y]], [[CONV_X]]
|
||||
; CHECK-NEXT: ret i32 [[COUNT_X]]
|
||||
;
|
||||
@ -203,8 +203,8 @@ define i32 @wrong_cast() {
|
||||
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[D_GEP_X]], align 4
|
||||
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; CHECK-NEXT: [[I_GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 12
|
||||
; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2
|
||||
; CHECK-NEXT: [[CONV_X:%.*]] = sext i16 [[WG_SIZE_X]] to i32
|
||||
; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2, !range [[RNG1]]
|
||||
; CHECK-NEXT: [[CONV_X:%.*]] = zext nneg i16 [[WG_SIZE_X]] to i32
|
||||
; CHECK-NEXT: [[COUNT_X:%.*]] = udiv i32 [[GRID_SIZE_X]], [[CONV_X]]
|
||||
; CHECK-NEXT: ret i32 [[COUNT_X]]
|
||||
;
|
||||
@ -253,8 +253,8 @@ define i32 @wrong_intrinsic() {
|
||||
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[D_GEP_X]], align 4
|
||||
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; CHECK-NEXT: [[I_GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 12
|
||||
; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2
|
||||
; CHECK-NEXT: [[CONV_X:%.*]] = zext i16 [[WG_SIZE_X]] to i32
|
||||
; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2, !range [[RNG1]]
|
||||
; CHECK-NEXT: [[CONV_X:%.*]] = zext nneg i16 [[WG_SIZE_X]] to i32
|
||||
; CHECK-NEXT: [[COUNT_X:%.*]] = udiv i32 [[GRID_SIZE_X]], [[CONV_X]]
|
||||
; CHECK-NEXT: ret i32 [[COUNT_X]]
|
||||
;
|
||||
@ -279,7 +279,7 @@ define i16 @empty_use() {
|
||||
; CHECK-NEXT: [[TRUNC_X:%.*]] = trunc i32 [[GRID_SIZE_X]] to i16
|
||||
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; CHECK-NEXT: [[I_GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 12
|
||||
; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2
|
||||
; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2, !range [[RNG1]]
|
||||
; CHECK-NEXT: [[COUNT_X:%.*]] = udiv i16 [[TRUNC_X]], [[WG_SIZE_X]]
|
||||
; CHECK-NEXT: ret i16 [[COUNT_X]]
|
||||
;
|
||||
@ -319,4 +319,5 @@ entry:
|
||||
}
|
||||
;.
|
||||
; CHECK: [[META0]] = !{}
|
||||
; CHECK: [[RNG1]] = !{i16 1, i16 1024}
|
||||
;.
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
||||
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope -check-prefix=GCN %s
|
||||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals smart
|
||||
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -check-prefix=GCN %s
|
||||
|
||||
; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
|
||||
define amdgpu_kernel void @get_local_size_x(ptr addrspace(1) %out) #0 {
|
||||
@ -63,9 +63,53 @@ define amdgpu_kernel void @get_local_size_z(ptr addrspace(1) %out) #0 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
|
||||
define amdgpu_kernel void @get_remainder_x(ptr addrspace(1) %out) #0 {
|
||||
define amdgpu_kernel void @get_remainder_x(ptr addrspace(1) %out) #2 {
|
||||
; GCN-LABEL: @get_remainder_x(
|
||||
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 18
|
||||
; GCN-NEXT: [[REMAINDER_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2, !range [[RNG1:![0-9]+]]
|
||||
; GCN-NEXT: store i16 [[REMAINDER_X]], ptr addrspace(1) [[OUT:%.*]], align 2
|
||||
; GCN-NEXT: ret void
|
||||
;
|
||||
%implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
%gep.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18
|
||||
%remainder.x = load i16, ptr addrspace(4) %gep.x, align 2
|
||||
store i16 %remainder.x, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @get_remainder_y(ptr addrspace(1) %out) #2 {
|
||||
; GCN-LABEL: @get_remainder_y(
|
||||
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; GCN-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 20
|
||||
; GCN-NEXT: [[REMAINDER_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 4, !range [[RNG1]]
|
||||
; GCN-NEXT: store i16 [[REMAINDER_Y]], ptr addrspace(1) [[OUT:%.*]], align 2
|
||||
; GCN-NEXT: ret void
|
||||
;
|
||||
%implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
%gep.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 20
|
||||
%remainder.y = load i16, ptr addrspace(4) %gep.y, align 2
|
||||
store i16 %remainder.y, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @get_remainder_z(ptr addrspace(1) %out) #2 {
|
||||
; GCN-LABEL: @get_remainder_z(
|
||||
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; GCN-NEXT: [[GEP_Z:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 22
|
||||
; GCN-NEXT: [[REMAINDER_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 2, !range [[RNG1]]
|
||||
; GCN-NEXT: store i16 [[REMAINDER_Z]], ptr addrspace(1) [[OUT:%.*]], align 2
|
||||
; GCN-NEXT: ret void
|
||||
;
|
||||
%implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
%gep.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 22
|
||||
%remainder.z = load i16, ptr addrspace(4) %gep.z, align 2
|
||||
store i16 %remainder.z, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @get_remainder_x_uniform(ptr addrspace(1) %out) #0 {
|
||||
; GCN-LABEL: @get_remainder_x_uniform(
|
||||
; GCN-NEXT: store i16 0, ptr addrspace(1) [[OUT:%.*]], align 2
|
||||
; GCN-NEXT: ret void
|
||||
;
|
||||
@ -76,27 +120,25 @@ define amdgpu_kernel void @get_remainder_x(ptr addrspace(1) %out) #0 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
|
||||
define amdgpu_kernel void @get_remainder_y(ptr addrspace(1) %out) #0 {
|
||||
; GCN-LABEL: @get_remainder_y(
|
||||
define amdgpu_kernel void @get_remainder_y_uniform(ptr addrspace(1) %out) #0 {
|
||||
; GCN-LABEL: @get_remainder_y_uniform(
|
||||
; GCN-NEXT: store i16 0, ptr addrspace(1) [[OUT:%.*]], align 2
|
||||
; GCN-NEXT: ret void
|
||||
;
|
||||
%implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
%gep.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18
|
||||
%gep.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 20
|
||||
%remainder.y = load i16, ptr addrspace(4) %gep.y, align 2
|
||||
store i16 %remainder.y, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
|
||||
define amdgpu_kernel void @get_remainder_z(ptr addrspace(1) %out) #0 {
|
||||
; GCN-LABEL: @get_remainder_z(
|
||||
define amdgpu_kernel void @get_remainder_z_uniform(ptr addrspace(1) %out) #0 {
|
||||
; GCN-LABEL: @get_remainder_z_uniform(
|
||||
; GCN-NEXT: store i16 0, ptr addrspace(1) [[OUT:%.*]], align 2
|
||||
; GCN-NEXT: ret void
|
||||
;
|
||||
%implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
%gep.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18
|
||||
%gep.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 22
|
||||
%remainder.z = load i16, ptr addrspace(4) %gep.z, align 2
|
||||
store i16 %remainder.z, ptr addrspace(1) %out
|
||||
ret void
|
||||
@ -107,7 +149,7 @@ define amdgpu_kernel void @get_work_group_size_x(ptr addrspace(1) %out) #0 {
|
||||
; GCN-LABEL: @get_work_group_size_x(
|
||||
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12
|
||||
; GCN-NEXT: [[GROUP_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 4
|
||||
; GCN-NEXT: [[GROUP_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 4, !range [[RNG2:![0-9]+]]
|
||||
; GCN-NEXT: store i16 [[GROUP_SIZE_X]], ptr addrspace(1) [[OUT:%.*]], align 2
|
||||
; GCN-NEXT: ret void
|
||||
;
|
||||
@ -123,7 +165,7 @@ define amdgpu_kernel void @get_work_group_size_y(ptr addrspace(1) %out) #0 {
|
||||
; GCN-LABEL: @get_work_group_size_y(
|
||||
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; GCN-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 14
|
||||
; GCN-NEXT: [[GROUP_SIZE_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2
|
||||
; GCN-NEXT: [[GROUP_SIZE_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2, !range [[RNG2]]
|
||||
; GCN-NEXT: store i16 [[GROUP_SIZE_Y]], ptr addrspace(1) [[OUT:%.*]], align 2
|
||||
; GCN-NEXT: ret void
|
||||
;
|
||||
@ -139,7 +181,7 @@ define amdgpu_kernel void @get_work_group_size_z(ptr addrspace(1) %out) #0 {
|
||||
; GCN-LABEL: @get_work_group_size_z(
|
||||
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; GCN-NEXT: [[GEP_Z:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 16
|
||||
; GCN-NEXT: [[GROUP_SIZE_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 4
|
||||
; GCN-NEXT: [[GROUP_SIZE_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 4, !range [[RNG2]]
|
||||
; GCN-NEXT: store i16 [[GROUP_SIZE_Z]], ptr addrspace(1) [[OUT:%.*]], align 2
|
||||
; GCN-NEXT: ret void
|
||||
;
|
||||
@ -189,6 +231,65 @@ define amdgpu_kernel void @get_work_group_size_z_reqd(ptr addrspace(1) %out) #0
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @get_remainder_x_wrong_type(ptr addrspace(1) %out) #2 {
|
||||
; GCN-LABEL: @get_remainder_x_wrong_type(
|
||||
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 18
|
||||
; GCN-NEXT: [[REMAINDER_X:%.*]] = load half, ptr addrspace(4) [[GEP_X]], align 2
|
||||
; GCN-NEXT: store half [[REMAINDER_X]], ptr addrspace(1) [[OUT:%.*]], align 2
|
||||
; GCN-NEXT: ret void
|
||||
;
|
||||
%implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
%gep.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18
|
||||
%remainder.x = load half, ptr addrspace(4) %gep.x, align 2
|
||||
store half %remainder.x, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @get_remainder_y_wrong_type(ptr addrspace(1) %out) #2 {
|
||||
; GCN-LABEL: @get_remainder_y_wrong_type(
|
||||
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 20
|
||||
; GCN-NEXT: [[REMAINDER_X:%.*]] = load half, ptr addrspace(4) [[GEP_X]], align 4
|
||||
; GCN-NEXT: store half [[REMAINDER_X]], ptr addrspace(1) [[OUT:%.*]], align 2
|
||||
; GCN-NEXT: ret void
|
||||
;
|
||||
%implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
%gep.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 20
|
||||
%remainder.x = load half, ptr addrspace(4) %gep.x, align 2
|
||||
store half %remainder.x, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @get_remainder_z_wrong_type(ptr addrspace(1) %out) #2 {
|
||||
; GCN-LABEL: @get_remainder_z_wrong_type(
|
||||
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 22
|
||||
; GCN-NEXT: [[REMAINDER_X:%.*]] = load half, ptr addrspace(4) [[GEP_X]], align 2
|
||||
; GCN-NEXT: store half [[REMAINDER_X]], ptr addrspace(1) [[OUT:%.*]], align 2
|
||||
; GCN-NEXT: ret void
|
||||
;
|
||||
%implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
%gep.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 22
|
||||
%remainder.x = load half, ptr addrspace(4) %gep.x, align 2
|
||||
store half %remainder.x, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @get_remainder_x_existing_range(ptr addrspace(1) %out) #2 {
|
||||
; GCN-LABEL: @get_remainder_x_existing_range(
|
||||
; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 18
|
||||
; GCN-NEXT: [[REMAINDER_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2, !range [[RNG4:![0-9]+]]
|
||||
; GCN-NEXT: store i16 [[REMAINDER_X]], ptr addrspace(1) [[OUT:%.*]], align 2
|
||||
; GCN-NEXT: ret void
|
||||
;
|
||||
%implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
||||
%gep.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18
|
||||
%remainder.x = load i16, ptr addrspace(4) %gep.x, align 2, !range !{i16 0, i16 10}
|
||||
store i16 %remainder.x, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #1
|
||||
declare i32 @llvm.amdgcn.workgroup.id.x() #1
|
||||
@ -199,5 +300,11 @@ declare i32 @llvm.amdgcn.workgroup.id.z() #1
|
||||
|
||||
attributes #0 = { nounwind "uniform-work-group-size" }
|
||||
attributes #1 = { nounwind readnone speculatable }
|
||||
attributes #2 = { nounwind }
|
||||
!0 = !{i32 8, i32 16, i32 2}
|
||||
!1 = !{i32 1, !"amdhsa_code_object_version", i32 500}
|
||||
;.
|
||||
; GCN: [[RNG1]] = !{i16 0, i16 1023}
|
||||
; GCN: [[RNG2]] = !{i16 1, i16 1024}
|
||||
; GCN: [[RNG4]] = !{i16 0, i16 10}
|
||||
;.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user