diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index d82dfd277127..02826e423943 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -82,18 +82,40 @@ Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) { } // end anonymous namespace -static void annotateGridSizeLoadWithRangeMD(LoadInst *Load, +static bool annotateGridSizeLoadWithRangeMD(LoadInst *Load, uint32_t MaxNumGroups) { if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max()) - return; + return false; if (!Load->getType()->isIntegerTy(32)) - return; + return false; // TODO: If there is existing range metadata, preserve it if it is stricter. + if (Load->hasMetadata(LLVMContext::MD_range)) + return false; + MDBuilder MDB(Load->getContext()); MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1)); Load->setMetadata(LLVMContext::MD_range, Range); + return true; +} + +static bool annotateGroupSizeLoadWithRangeMD(LoadInst *Load, bool IsRemainder) { + if (!Load->getType()->isIntegerTy(16)) + return false; + + // TODO: If there is existing range metadata, preserve it if it is stricter. 
+ if (Load->hasMetadata(LLVMContext::MD_range)) + return false; + + // !range is half-open [Lo, Hi): group sizes span [1, Max] and remainders + // [0, Max - 1], so Hi is one past the inclusive maximum in both cases. + MDBuilder MDB(Load->getContext()); + MDNode *Range = MDB.createRange( + APInt(16, !IsRemainder), + APInt(16, AMDGPU::IsaInfo::getMaxFlatWorkGroupSize() + !IsRemainder)); + Load->setMetadata(LLVMContext::MD_range, Range); + return true; } static bool processUse(CallInst *CI, bool IsV5OrAbove) { @@ -109,18 +131,13 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups", /*Size=*/3, /*DefaultVal=*/0); - if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize && - !Intrinsic::getDeclarationIfExists(CI->getModule(), - Intrinsic::amdgcn_dispatch_ptr) && - none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; })) - return false; - Value *BlockCounts[3] = {nullptr, nullptr, nullptr}; Value *GroupSizes[3] = {nullptr, nullptr, nullptr}; Value *Remainders[3] = {nullptr, nullptr, nullptr}; Value *GridSizes[3] = {nullptr, nullptr, nullptr}; const DataLayout &DL = F->getDataLayout(); + bool MadeChange = false; // We expect to see several GEP users, casted to the appropriate type and // loaded. 
@@ -155,44 +172,59 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { case HIDDEN_BLOCK_COUNT_X: if (LoadSize == 4) { BlockCounts[0] = Load; - annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]); + MadeChange |= + annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]); } break; case HIDDEN_BLOCK_COUNT_Y: if (LoadSize == 4) { BlockCounts[1] = Load; - annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]); + MadeChange |= + annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]); } break; case HIDDEN_BLOCK_COUNT_Z: if (LoadSize == 4) { BlockCounts[2] = Load; - annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]); + MadeChange |= + annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]); } break; case HIDDEN_GROUP_SIZE_X: - if (LoadSize == 2) + if (LoadSize == 2) { GroupSizes[0] = Load; + MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, false); + } break; case HIDDEN_GROUP_SIZE_Y: - if (LoadSize == 2) + if (LoadSize == 2) { GroupSizes[1] = Load; + MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, false); + } break; case HIDDEN_GROUP_SIZE_Z: - if (LoadSize == 2) + if (LoadSize == 2) { GroupSizes[2] = Load; + MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, false); + } break; case HIDDEN_REMAINDER_X: - if (LoadSize == 2) + if (LoadSize == 2) { Remainders[0] = Load; + MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, true); + } break; case HIDDEN_REMAINDER_Y: - if (LoadSize == 2) + if (LoadSize == 2) { Remainders[1] = Load; + MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, true); + } break; case HIDDEN_REMAINDER_Z: - if (LoadSize == 2) + if (LoadSize == 2) { Remainders[2] = Load; + MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, true); + } break; default: break; @@ -229,7 +261,6 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { } } - bool MadeChange = false; if (IsV5OrAbove && HasUniformWorkGroupSize) { // Under v5 __ockl_get_local_size returns the value computed by the // expression: diff --git 
a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index b9c7bad4cef0..012e2dd6b380 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -907,7 +907,7 @@ public: /// \returns Maximum flat work group size supported by the subtarget. unsigned getMaxFlatWorkGroupSize() const override { - return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); + return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(); } /// \returns Number of waves per execution unit required to support the given diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.h b/llvm/lib/Target/AMDGPU/R600Subtarget.h index 23ea2752be29..33bf4144a104 100644 --- a/llvm/lib/Target/AMDGPU/R600Subtarget.h +++ b/llvm/lib/Target/AMDGPU/R600Subtarget.h @@ -144,7 +144,7 @@ public: /// \returns Maximum flat work group size supported by the subtarget. unsigned getMaxFlatWorkGroupSize() const override { - return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); + return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(); } /// \returns Number of waves per execution unit required to support the given diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 488c150dd5c2..710e9c5166f2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1265,11 +1265,6 @@ unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI, unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI) { return 1; } -unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI) { - // Some subtargets allow encoding 2048, but this isn't tested or supported. 
- return 1024; -} - unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize) { return divideCeil(FlatWorkGroupSize, getWavefrontSize(STI)); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index b3d20777ccfc..44462e4c244b 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -265,8 +265,11 @@ unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI, /// \returns Minimum flat work group size for given subtarget \p STI. unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI); -/// \returns Maximum flat work group size for given subtarget \p STI. -unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI); +/// \returns Maximum flat work group size +constexpr unsigned getMaxFlatWorkGroupSize() { + // Some subtargets allow encoding 2048, but this isn't tested or supported. + return 1024; +} /// \returns Number of waves per work group for given subtarget \p STI and /// \p FlatWorkGroupSize. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll index 906429212992..bc28199ef13d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll @@ -17,7 +17,7 @@ define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range() #0 { ; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG1:![0-9]+]] ; CHECK-NEXT: ret i32 [[GRID_SIZE_X]] ; %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() @@ -30,7 +30,7 @@ define i32 @use_grid_size_y_max_num_workgroups() #0 { ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; CHECK-NEXT: [[GEP_GRID_SIZE_Y:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 4 -; CHECK-NEXT: [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Y]], align 4, !range [[RNG1:![0-9]+]] +; CHECK-NEXT: [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Y]], align 4, !range [[RNG2:![0-9]+]] ; CHECK-NEXT: ret i32 [[GRID_SIZE_Y]] ; %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() @@ -44,7 +44,7 @@ define i32 @use_grid_size_z_max_num_workgroups() #0 { ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; CHECK-NEXT: [[GEP_GRID_SIZE_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 8 -; CHECK-NEXT: [[GRID_SIZE_Z:%.*]] = load i32, ptr 
addrspace(4) [[GEP_GRID_SIZE_Z]], align 4, !range [[RNG2:![0-9]+]] +; CHECK-NEXT: [[GRID_SIZE_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Z]], align 4, !range [[RNG3:![0-9]+]] ; CHECK-NEXT: ret i32 [[GRID_SIZE_Z]] ; %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() @@ -69,7 +69,7 @@ define i32 @use_grid_size_x_max_num_workgroups_max_minus_1() #1 { ; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max_minus_1( ; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG3:![0-9]+]] +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG4:![0-9]+]] ; CHECK-NEXT: ret i32 [[GRID_SIZE_X]] ; %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() @@ -118,7 +118,8 @@ attributes #3 = { "amdgpu-max-num-workgroups"="0,42,89" } ; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. ; CHECK: [[RNG0]] = !{i32 1, i32 37} -; CHECK: [[RNG1]] = !{i32 1, i32 43} -; CHECK: [[RNG2]] = !{i32 1, i32 90} -; CHECK: [[RNG3]] = !{i32 1, i32 -1} +; CHECK: [[RNG1]] = !{i32 0, i32 -1} +; CHECK: [[RNG2]] = !{i32 1, i32 43} +; CHECK: [[RNG3]] = !{i32 1, i32 90} +; CHECK: [[RNG4]] = !{i32 1, i32 -1} ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll index d698fb33c502..7bc953a2f635 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll @@ -159,8 +159,8 @@ define i32 @bad_offset() { ; CHECK-NEXT: [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[D_GEP_Y]], align 4 ; CHECK-NEXT: [[IMPLICITARG:%.*]] = call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; CHECK-NEXT: [[I_GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 12 -; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2 -; CHECK-NEXT: [[CONV_X:%.*]] = zext i16 [[WG_SIZE_X]] to i32 +; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2, !range [[RNG1:![0-9]+]] +; CHECK-NEXT: [[CONV_X:%.*]] = zext nneg i16 [[WG_SIZE_X]] to i32 ; CHECK-NEXT: [[COUNT_X:%.*]] = udiv i32 [[GRID_SIZE_Y]], [[CONV_X]] ; CHECK-NEXT: ret i32 [[COUNT_X]] ; @@ -203,8 +203,8 @@ define i32 @wrong_cast() { ; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[D_GEP_X]], align 4 ; CHECK-NEXT: [[IMPLICITARG:%.*]] = call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; CHECK-NEXT: [[I_GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 12 -; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2 -; CHECK-NEXT: [[CONV_X:%.*]] = sext i16 [[WG_SIZE_X]] to i32 +; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2, !range [[RNG1]] +; CHECK-NEXT: [[CONV_X:%.*]] = zext nneg i16 [[WG_SIZE_X]] to i32 ; CHECK-NEXT: [[COUNT_X:%.*]] = udiv i32 [[GRID_SIZE_X]], [[CONV_X]] ; CHECK-NEXT: ret i32 [[COUNT_X]] ; @@ -253,8 +253,8 @@ define i32 @wrong_intrinsic() { ; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[D_GEP_X]], align 4 ; CHECK-NEXT: [[IMPLICITARG:%.*]] = call 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; CHECK-NEXT: [[I_GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 12 -; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2 -; CHECK-NEXT: [[CONV_X:%.*]] = zext i16 [[WG_SIZE_X]] to i32 +; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2, !range [[RNG1]] +; CHECK-NEXT: [[CONV_X:%.*]] = zext nneg i16 [[WG_SIZE_X]] to i32 ; CHECK-NEXT: [[COUNT_X:%.*]] = udiv i32 [[GRID_SIZE_X]], [[CONV_X]] ; CHECK-NEXT: ret i32 [[COUNT_X]] ; @@ -279,7 +279,7 @@ define i16 @empty_use() { ; CHECK-NEXT: [[TRUNC_X:%.*]] = trunc i32 [[GRID_SIZE_X]] to i16 ; CHECK-NEXT: [[IMPLICITARG:%.*]] = call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; CHECK-NEXT: [[I_GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 12 -; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2 +; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2, !range [[RNG1]] ; CHECK-NEXT: [[COUNT_X:%.*]] = udiv i16 [[TRUNC_X]], [[WG_SIZE_X]] ; CHECK-NEXT: ret i16 [[COUNT_X]] ; @@ -319,4 +319,5 @@ entry: } ;. ; CHECK: [[META0]] = !{} +; CHECK: [[RNG1]] = !{i16 1, i16 1025} ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll index bec673bb23a0..fee955f3da89 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals smart +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -check-prefix=GCN %s ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn define amdgpu_kernel void @get_local_size_x(ptr addrspace(1) %out) #0 { @@ -63,9 +63,53 @@ define amdgpu_kernel void @get_local_size_z(ptr addrspace(1) %out) #0 { ret void } -; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn -define amdgpu_kernel void @get_remainder_x(ptr addrspace(1) %out) #0 { +define amdgpu_kernel void @get_remainder_x(ptr addrspace(1) %out) #2 { ; GCN-LABEL: @get_remainder_x( +; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 18 +; GCN-NEXT: [[REMAINDER_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2, !range [[RNG1:![0-9]+]] +; GCN-NEXT: store i16 [[REMAINDER_X]], ptr addrspace(1) [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; + %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18 + %remainder.x = load i16, ptr addrspace(4) %gep.x, align 2 + store i16 %remainder.x, ptr addrspace(1) %out + ret void +} + 
+define amdgpu_kernel void @get_remainder_y(ptr addrspace(1) %out) #2 { +; GCN-LABEL: @get_remainder_y( +; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; GCN-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 20 +; GCN-NEXT: [[REMAINDER_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 4, !range [[RNG1]] +; GCN-NEXT: store i16 [[REMAINDER_Y]], ptr addrspace(1) [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; + %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 20 + %remainder.y = load i16, ptr addrspace(4) %gep.y, align 2 + store i16 %remainder.y, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @get_remainder_z(ptr addrspace(1) %out) #2 { +; GCN-LABEL: @get_remainder_z( +; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; GCN-NEXT: [[GEP_Z:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 22 +; GCN-NEXT: [[REMAINDER_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 2, !range [[RNG1]] +; GCN-NEXT: store i16 [[REMAINDER_Z]], ptr addrspace(1) [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; + %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 22 + %remainder.z = load i16, ptr addrspace(4) %gep.z, align 2 + store i16 %remainder.z, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @get_remainder_x_uniform(ptr addrspace(1) %out) #0 { +; GCN-LABEL: @get_remainder_x_uniform( ; GCN-NEXT: store i16 0, ptr addrspace(1) [[OUT:%.*]], align 2 ; GCN-NEXT: ret void ; @@ -76,27 +120,25 @@ define amdgpu_kernel void @get_remainder_x(ptr addrspace(1) %out) #0 { ret void } -; Function Attrs: mustprogress nofree norecurse nosync nounwind 
readnone willreturn -define amdgpu_kernel void @get_remainder_y(ptr addrspace(1) %out) #0 { -; GCN-LABEL: @get_remainder_y( +define amdgpu_kernel void @get_remainder_y_uniform(ptr addrspace(1) %out) #0 { +; GCN-LABEL: @get_remainder_y_uniform( ; GCN-NEXT: store i16 0, ptr addrspace(1) [[OUT:%.*]], align 2 ; GCN-NEXT: ret void ; %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %gep.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18 + %gep.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 20 %remainder.y = load i16, ptr addrspace(4) %gep.y, align 2 store i16 %remainder.y, ptr addrspace(1) %out ret void } -; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn -define amdgpu_kernel void @get_remainder_z(ptr addrspace(1) %out) #0 { -; GCN-LABEL: @get_remainder_z( +define amdgpu_kernel void @get_remainder_z_uniform(ptr addrspace(1) %out) #0 { +; GCN-LABEL: @get_remainder_z_uniform( ; GCN-NEXT: store i16 0, ptr addrspace(1) [[OUT:%.*]], align 2 ; GCN-NEXT: ret void ; %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %gep.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18 + %gep.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 22 %remainder.z = load i16, ptr addrspace(4) %gep.z, align 2 store i16 %remainder.z, ptr addrspace(1) %out ret void @@ -107,7 +149,7 @@ define amdgpu_kernel void @get_work_group_size_x(ptr addrspace(1) %out) #0 { ; GCN-LABEL: @get_work_group_size_x( ; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12 -; GCN-NEXT: [[GROUP_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 4 +; GCN-NEXT: [[GROUP_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 4, !range [[RNG2:![0-9]+]] ; GCN-NEXT: store i16 
[[GROUP_SIZE_X]], ptr addrspace(1) [[OUT:%.*]], align 2 ; GCN-NEXT: ret void ; @@ -123,7 +165,7 @@ define amdgpu_kernel void @get_work_group_size_y(ptr addrspace(1) %out) #0 { ; GCN-LABEL: @get_work_group_size_y( ; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; GCN-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 14 -; GCN-NEXT: [[GROUP_SIZE_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2 +; GCN-NEXT: [[GROUP_SIZE_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2, !range [[RNG2]] ; GCN-NEXT: store i16 [[GROUP_SIZE_Y]], ptr addrspace(1) [[OUT:%.*]], align 2 ; GCN-NEXT: ret void ; @@ -139,7 +181,7 @@ define amdgpu_kernel void @get_work_group_size_z(ptr addrspace(1) %out) #0 { ; GCN-LABEL: @get_work_group_size_z( ; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; GCN-NEXT: [[GEP_Z:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 16 -; GCN-NEXT: [[GROUP_SIZE_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 4 +; GCN-NEXT: [[GROUP_SIZE_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 4, !range [[RNG2]] ; GCN-NEXT: store i16 [[GROUP_SIZE_Z]], ptr addrspace(1) [[OUT:%.*]], align 2 ; GCN-NEXT: ret void ; @@ -189,6 +231,65 @@ define amdgpu_kernel void @get_work_group_size_z_reqd(ptr addrspace(1) %out) #0 ret void } +define amdgpu_kernel void @get_remainder_x_wrong_type(ptr addrspace(1) %out) #2 { +; GCN-LABEL: @get_remainder_x_wrong_type( +; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 18 +; GCN-NEXT: [[REMAINDER_X:%.*]] = load half, ptr addrspace(4) [[GEP_X]], align 2 +; GCN-NEXT: store half [[REMAINDER_X]], ptr addrspace(1) [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; 
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18 + %remainder.x = load half, ptr addrspace(4) %gep.x, align 2 + store half %remainder.x, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @get_remainder_y_wrong_type(ptr addrspace(1) %out) #2 { +; GCN-LABEL: @get_remainder_y_wrong_type( +; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 20 +; GCN-NEXT: [[REMAINDER_X:%.*]] = load half, ptr addrspace(4) [[GEP_X]], align 4 +; GCN-NEXT: store half [[REMAINDER_X]], ptr addrspace(1) [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; + %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 20 + %remainder.x = load half, ptr addrspace(4) %gep.x, align 2 + store half %remainder.x, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @get_remainder_z_wrong_type(ptr addrspace(1) %out) #2 { +; GCN-LABEL: @get_remainder_z_wrong_type( +; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 22 +; GCN-NEXT: [[REMAINDER_X:%.*]] = load half, ptr addrspace(4) [[GEP_X]], align 2 +; GCN-NEXT: store half [[REMAINDER_X]], ptr addrspace(1) [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; + %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 22 + %remainder.x = load half, ptr addrspace(4) %gep.x, align 2 + store half %remainder.x, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @get_remainder_x_existing_range(ptr 
addrspace(1) %out) #2 { +; GCN-LABEL: @get_remainder_x_existing_range( +; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 18 +; GCN-NEXT: [[REMAINDER_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2, !range [[RNG4:![0-9]+]] +; GCN-NEXT: store i16 [[REMAINDER_X]], ptr addrspace(1) [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; + %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18 + %remainder.x = load i16, ptr addrspace(4) %gep.x, align 2, !range !{i16 0, i16 10} + store i16 %remainder.x, ptr addrspace(1) %out + ret void +} declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #1 declare i32 @llvm.amdgcn.workgroup.id.x() #1 @@ -199,5 +300,11 @@ declare i32 @llvm.amdgcn.workgroup.id.z() #1 attributes #0 = { nounwind "uniform-work-group-size" } attributes #1 = { nounwind readnone speculatable } +attributes #2 = { nounwind } !0 = !{i32 8, i32 16, i32 2} !1 = !{i32 1, !"amdhsa_code_object_version", i32 500} +;. +; GCN: [[RNG1]] = !{i16 0, i16 1024} +; GCN: [[RNG2]] = !{i16 1, i16 1025} +; GCN: [[RNG4]] = !{i16 0, i16 10} +;.