Compare commits

...

16 Commits

Author SHA1 Message Date
Aaditya
065fb26307 Propagate Constants for Wave Reduction Intrinsics 2025-08-21 15:09:08 +05:30
Aaditya
34355ed73c Add builtins for wave reduction intrinsics 2025-08-21 15:09:08 +05:30
Aaditya
460ec42cc1 Code Formatting 2025-08-21 14:59:38 +05:30
Aaditya
15b3c6682f Directly checking for S_XOR_B32 2025-08-21 14:59:38 +05:30
Aaditya
9f5dfe1fad Running Clang Format 2025-08-21 14:59:37 +05:30
Aaditya
47bb973176 Removing break before else 2025-08-21 14:59:37 +05:30
Aaditya
0819895763 Removing Redundant Instructions 2025-08-21 14:59:37 +05:30
Aaditya
12c1daf0ce [AMDGPU] Extending wave reduction intrinsics for i64 types - 3
Supporting Arithmetic Operations: `and`, `or`, `xor`
2025-08-21 14:59:37 +05:30
Aaditya
e5007647e5 Adding helper function for expanding arithmetic ops. 2025-08-21 14:58:43 +05:30
Aaditya
163ae0d91e Checking for targets with native 64-bit add/sub support 2025-08-20 14:55:18 +05:30
Aaditya
991f9b6ddf Marking dead scc 2025-08-20 13:57:44 +05:30
Aaditya
6579973bcd Renaming Variables 2025-08-20 13:57:43 +05:30
Aaditya
79b9c33304 [AMDGPU] Extending wave reduction intrinsics for i64 types - 2
Supporting Arithmetic Operations: `add`, `sub`
2025-08-20 13:57:43 +05:30
Aaditya
cae47329ff Using S_MOV_B64_IMM_PSEUDO instead of dealing with legality concerns. 2025-08-20 13:57:34 +05:30
Aaditya
9362371fdc Addressing Review Comments 2025-08-13 18:31:43 +05:30
Aaditya
4277c1370b [AMDGPU] Extending wave reduction intrinsics for i64 types - 1
Supporting Min/Max Operations: `min`, `max`, `umin`, `umax`
2025-08-13 12:33:51 +05:30
16 changed files with 9764 additions and 2866 deletions

View File

@ -361,6 +361,31 @@ BUILTIN(__builtin_amdgcn_endpgm, "v", "nr")
BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n")
BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n")
//===----------------------------------------------------------------------===//
// Wave Reduction builtins.
//===----------------------------------------------------------------------===//
// Every entry below has the shape result(value, second-operand): the result
// type string is identical to that of the first operand, and the second
// operand is always "Zi".
// NOTE(review): "Z"/"W" presumably select 32-/64-bit integers and "U" marks
// unsignedness; "nc" presumably means nothrow + const -- confirm against the
// legend in Builtins.def.

// 32-bit variants.
BUILTIN(__builtin_amdgcn_wave_reduce_add_u32, "ZUiZUiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_sub_u32, "ZUiZUiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_min_i32, "ZiZiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_min_u32, "ZUiZUiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_max_i32, "ZiZiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_max_u32, "ZUiZUiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_and_b32, "ZiZiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_or_b32, "ZiZiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_xor_b32, "ZiZiZi", "nc")
// 64-bit variants; the second operand keeps the same "Zi" type string.
BUILTIN(__builtin_amdgcn_wave_reduce_add_u64, "WUiWUiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_sub_u64, "WUiWUiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_min_i64, "WiWiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_min_u64, "WUiWUiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_max_i64, "WiWiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWiZi", "nc")
//===----------------------------------------------------------------------===//
// R600-NI only builtins.
//===----------------------------------------------------------------------===//

View File

@ -295,11 +295,69 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
}
/// Maps a wave-reduction builtin to the LLVM intrinsic that implements it.
///
/// The 32-bit and 64-bit flavours of a builtin share one intrinsic ID. The
/// unsigned min/max builtins (`min_u*`, `max_u*`) map to the `umin`/`umax`
/// intrinsics, while the signed ones (`min_i*`, `max_i*`) map to `min`/`max`.
///
/// \param BuiltinID one of the `BI__builtin_amdgcn_wave_reduce_*` IDs.
/// \returns the corresponding `Intrinsic::amdgcn_wave_reduce_*` ID. Any other
///          \p BuiltinID reaches `llvm_unreachable`.
static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
  switch (BuiltinID) {
  // Arithmetic reductions.
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
    return Intrinsic::amdgcn_wave_reduce_add;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
    return Intrinsic::amdgcn_wave_reduce_sub;

  // Signed / unsigned minimum.
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
    return Intrinsic::amdgcn_wave_reduce_min;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
    return Intrinsic::amdgcn_wave_reduce_umin;

  // Signed / unsigned maximum.
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
    return Intrinsic::amdgcn_wave_reduce_max;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
    return Intrinsic::amdgcn_wave_reduce_umax;

  // Bitwise reductions.
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64:
    return Intrinsic::amdgcn_wave_reduce_and;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64:
    return Intrinsic::amdgcn_wave_reduce_or;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64:
    return Intrinsic::amdgcn_wave_reduce_xor;

  default:
    llvm_unreachable("Unknown BuiltinID for wave reduction");
  }
}
Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
const CallExpr *E) {
llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
llvm::SyncScope::ID SSID;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64: {
Intrinsic::ID IID = getIntrinsicIDforWaveReduction(BuiltinID);
llvm::Value *Value = EmitScalarExpr(E->getArg(0));
llvm::Value *Strategy = EmitScalarExpr(E->getArg(1));
llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()});
return Builder.CreateCall(F, {Value, Strategy});
}
case AMDGPU::BI__builtin_amdgcn_div_scale:
case AMDGPU::BI__builtin_amdgcn_div_scalef: {
// Translate from the intrinsics's struct return to the builtin's out

View File

@ -398,6 +398,384 @@ void test_s_sendmsghalt_var(int in)
__builtin_amdgcn_s_sendmsghalt(1, in);
}
// Wave reduction `add`: the u32 and u64 builtins are each called with a second
// operand of 0, 1 and 2 (the `_default`, `_iterative` and `_dpp` tests), and
// every call must lower to llvm.amdgcn.wave.reduce.add with the matching
// i32/i64 overload.
// NOTE(review): the u64 tests store the i64 result through `global int*`, so
// the value is narrowed before the store -- confirm this is intentional.
// CHECK-LABEL: @test_wave_reduce_add_u32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
void test_wave_reduce_add_u32_default(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_add_u32(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_add_u64_default
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.add.i64(
void test_wave_reduce_add_u64_default(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_add_u64(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_add_u32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
void test_wave_reduce_add_u32_iterative(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_add_u32(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_add_u64_iterative
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.add.i64(
void test_wave_reduce_add_u64_iterative(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_add_u64(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_add_u32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
void test_wave_reduce_add_u32_dpp(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_add_u32(in, 2);
}
// CHECK-LABEL: @test_wave_reduce_add_u64_dpp
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.add.i64(
void test_wave_reduce_add_u64_dpp(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_add_u64(in, 2);
}
// Wave reduction `sub`: the u32 and u64 builtins are each called with a second
// operand of 0, 1 and 2 (the `_default`, `_iterative` and `_dpp` tests), and
// every call must lower to llvm.amdgcn.wave.reduce.sub with the matching
// i32/i64 overload.
// NOTE(review): the u64 tests store the i64 result through `global int*`, so
// the value is narrowed before the store -- confirm this is intentional.
// CHECK-LABEL: @test_wave_reduce_sub_u32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
void test_wave_reduce_sub_u32_default(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_sub_u32(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_sub_u64_default
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.sub.i64(
void test_wave_reduce_sub_u64_default(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_sub_u64(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_sub_u32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
void test_wave_reduce_sub_u32_iterative(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_sub_u32(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_sub_u64_iterative
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.sub.i64(
void test_wave_reduce_sub_u64_iterative(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_sub_u64(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_sub_u32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
void test_wave_reduce_sub_u32_dpp(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_sub_u32(in, 2);
}
// CHECK-LABEL: @test_wave_reduce_sub_u64_dpp
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.sub.i64(
void test_wave_reduce_sub_u64_dpp(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_sub_u64(in, 2);
}
// Wave reduction `and`: the b32 and b64 builtins are each called with a second
// operand of 0, 1 and 2 (the `_default`, `_iterative` and `_dpp` tests), and
// every call must lower to llvm.amdgcn.wave.reduce.and with the matching
// i32/i64 overload.
// NOTE(review): the b64 tests store the i64 result through `global int*`, so
// the value is narrowed before the store -- confirm this is intentional.
// CHECK-LABEL: @test_wave_reduce_and_b32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32(
void test_wave_reduce_and_b32_default(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_and_b32(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_and_b64_default
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.and.i64(
void test_wave_reduce_and_b64_default(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_and_b64(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_and_b32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32(
void test_wave_reduce_and_b32_iterative(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_and_b32(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_and_b64_iterative
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.and.i64(
void test_wave_reduce_and_b64_iterative(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_and_b64(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_and_b32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32(
void test_wave_reduce_and_b32_dpp(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_and_b32(in, 2);
}
// CHECK-LABEL: @test_wave_reduce_and_b64_dpp
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.and.i64(
void test_wave_reduce_and_b64_dpp(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_and_b64(in, 2);
}
// Wave reduction `or`: the b32 and b64 builtins are each called with a second
// operand of 0, 1 and 2 (the `_default`, `_iterative` and `_dpp` tests), and
// every call must lower to llvm.amdgcn.wave.reduce.or with the matching
// i32/i64 overload.
// NOTE(review): the b64 tests store the i64 result through `global int*`, so
// the value is narrowed before the store -- confirm this is intentional.
// CHECK-LABEL: @test_wave_reduce_or_b32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32(
void test_wave_reduce_or_b32_default(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_or_b32(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_or_b64_default
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.or.i64(
void test_wave_reduce_or_b64_default(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_or_b64(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_or_b32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32(
void test_wave_reduce_or_b32_iterative(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_or_b32(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_or_b64_iterative
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.or.i64(
void test_wave_reduce_or_b64_iterative(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_or_b64(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_or_b32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32(
void test_wave_reduce_or_b32_dpp(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_or_b32(in, 2);
}
// CHECK-LABEL: @test_wave_reduce_or_b64_dpp
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.or.i64(
void test_wave_reduce_or_b64_dpp(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_or_b64(in, 2);
}
// Wave reduction `xor`: the b32 and b64 builtins are each called with a second
// operand of 0, 1 and 2 (the `_default`, `_iterative` and `_dpp` tests), and
// every call must lower to llvm.amdgcn.wave.reduce.xor with the matching
// i32/i64 overload.
// NOTE(review): the b64 tests store the i64 result through `global int*`, so
// the value is narrowed before the store -- confirm this is intentional.
// CHECK-LABEL: @test_wave_reduce_xor_b32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32(
void test_wave_reduce_xor_b32_default(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_xor_b32(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_xor_b64_default
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.xor.i64(
void test_wave_reduce_xor_b64_default(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_xor_b64(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_xor_b32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32(
void test_wave_reduce_xor_b32_iterative(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_xor_b32(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_xor_b64_iterative
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.xor.i64(
void test_wave_reduce_xor_b64_iterative(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_xor_b64(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_xor_b32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32(
void test_wave_reduce_xor_b32_dpp(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_xor_b32(in, 2);
}
// CHECK-LABEL: @test_wave_reduce_xor_b64_dpp
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.xor.i64(
void test_wave_reduce_xor_b64_dpp(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_xor_b64(in, 2);
}
// Wave reduction signed `min`: the i32 and i64 builtins are each called with a
// second operand of 0, 1 and 2 (the `_default`, `_iterative` and `_dpp`
// tests), and every call must lower to llvm.amdgcn.wave.reduce.min with the
// matching i32/i64 overload.
// NOTE(review): the i64 tests store the i64 result through `global int*`, so
// the value is narrowed before the store -- confirm this is intentional.
// CHECK-LABEL: @test_wave_reduce_min_i32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(
void test_wave_reduce_min_i32_default(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_min_i32(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_min_i64_default
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.min.i64(
void test_wave_reduce_min_i64_default(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_min_i64(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_min_i32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(
void test_wave_reduce_min_i32_iterative(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_min_i32(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_min_i64_iterative
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.min.i64(
void test_wave_reduce_min_i64_iterative(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_min_i64(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_min_i32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(
void test_wave_reduce_min_i32_dpp(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_min_i32(in, 2);
}
// CHECK-LABEL: @test_wave_reduce_min_i64_dpp
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.min.i64(
void test_wave_reduce_min_i64_dpp(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_min_i64(in, 2);
}
// Wave reduction unsigned `min`: the u32 and u64 builtins are each called with
// a second operand of 0, 1 and 2 (the `_default`, `_iterative` and `_dpp`
// tests), and every call must lower to llvm.amdgcn.wave.reduce.umin (note the
// `u` prefix on the intrinsic) with the matching i32/i64 overload.
// NOTE(review): the u64 tests store the i64 result through `global int*`, so
// the value is narrowed before the store -- confirm this is intentional.
// CHECK-LABEL: @test_wave_reduce_min_u32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32(
void test_wave_reduce_min_u32_default(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_min_u32(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_min_u64_default
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umin.i64(
void test_wave_reduce_min_u64_default(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_min_u64(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_min_u32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32(
void test_wave_reduce_min_u32_iterative(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_min_u32(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_min_u64_iterative
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umin.i64(
void test_wave_reduce_min_u64_iterative(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_min_u64(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_min_u32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32(
void test_wave_reduce_min_u32_dpp(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_min_u32(in, 2);
}
// CHECK-LABEL: @test_wave_reduce_min_u64_dpp
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umin.i64(
void test_wave_reduce_min_u64_dpp(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_min_u64(in, 2);
}
// Wave reduction signed `max`: the i32 and i64 builtins are each called with a
// second operand of 0, 1 and 2 (the `_default`, `_iterative` and `_dpp`
// tests), and every call must lower to llvm.amdgcn.wave.reduce.max with the
// matching i32/i64 overload.
// NOTE(review): the i64 tests store the i64 result through `global int*`, so
// the value is narrowed before the store -- confirm this is intentional.
// CHECK-LABEL: @test_wave_reduce_max_i32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(
void test_wave_reduce_max_i32_default(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_max_i32(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_max_i64_default
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.max.i64(
void test_wave_reduce_max_i64_default(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_max_i64(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_max_i32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(
void test_wave_reduce_max_i32_iterative(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_max_i32(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_max_i64_iterative
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.max.i64(
void test_wave_reduce_max_i64_iterative(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_max_i64(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_max_i32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(
void test_wave_reduce_max_i32_dpp(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_max_i32(in, 2);
}
// CHECK-LABEL: @test_wave_reduce_max_i64_dpp
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.max.i64(
void test_wave_reduce_max_i64_dpp(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_max_i64(in, 2);
}
// Wave reduction unsigned `max`: the u32 and u64 builtins are each called with
// a second operand of 0, 1 and 2 (the `_default`, `_iterative` and `_dpp`
// tests), and every call must lower to llvm.amdgcn.wave.reduce.umax (note the
// `u` prefix on the intrinsic) with the matching i32/i64 overload.
// NOTE(review): the u64 tests store the i64 result through `global int*`, so
// the value is narrowed before the store -- confirm this is intentional.
// CHECK-LABEL: @test_wave_reduce_max_u32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32(
void test_wave_reduce_max_u32_default(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_max_u32(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_max_u64_default
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umax.i64(
void test_wave_reduce_max_u64_default(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_max_u64(in, 0);
}
// CHECK-LABEL: @test_wave_reduce_max_u32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32(
void test_wave_reduce_max_u32_iterative(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_max_u32(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_max_u64_iterative
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umax.i64(
void test_wave_reduce_max_u64_iterative(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_max_u64(in, 1);
}
// CHECK-LABEL: @test_wave_reduce_max_u32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32(
void test_wave_reduce_max_u32_dpp(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_max_u32(in, 2);
}
// CHECK-LABEL: @test_wave_reduce_max_u64_dpp
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umax.i64(
void test_wave_reduce_max_u64_dpp(global int* out, long in)
{
*out = __builtin_amdgcn_wave_reduce_max_u64(in, 2);
}
// CHECK-LABEL: @test_s_barrier
// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.barrier(
void test_s_barrier()

14
llvm/lib/Analysis/ConstantFolding.cpp Normal file → Executable file
View File

@ -1652,6 +1652,13 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
case Intrinsic::amdgcn_perm:
case Intrinsic::amdgcn_wave_reduce_umin:
case Intrinsic::amdgcn_wave_reduce_umax:
case Intrinsic::amdgcn_wave_reduce_max:
case Intrinsic::amdgcn_wave_reduce_min:
case Intrinsic::amdgcn_wave_reduce_add:
case Intrinsic::amdgcn_wave_reduce_sub:
case Intrinsic::amdgcn_wave_reduce_and:
case Intrinsic::amdgcn_wave_reduce_or:
case Intrinsic::amdgcn_wave_reduce_xor:
case Intrinsic::amdgcn_s_wqm:
case Intrinsic::amdgcn_s_quadmask:
case Intrinsic::amdgcn_s_bitreplicate:
@ -3525,6 +3532,13 @@ static Constant *ConstantFoldIntrinsicCall2(Intrinsic::ID IntrinsicID, Type *Ty,
return ConstantInt::get(Ty, C0->abs());
case Intrinsic::amdgcn_wave_reduce_umin:
case Intrinsic::amdgcn_wave_reduce_umax:
case Intrinsic::amdgcn_wave_reduce_max:
case Intrinsic::amdgcn_wave_reduce_min:
case Intrinsic::amdgcn_wave_reduce_add:
case Intrinsic::amdgcn_wave_reduce_sub:
case Intrinsic::amdgcn_wave_reduce_and:
case Intrinsic::amdgcn_wave_reduce_or:
case Intrinsic::amdgcn_wave_reduce_xor:
return dyn_cast<Constant>(Operands[0]);
}

View File

@ -5192,7 +5192,59 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
return LoopBB;
}
static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
/// Expands a 64-bit scalar add/sub pseudo instruction in place.
///
/// \p MI is treated as an add when its opcode is AMDGPU::S_ADD_U64_PSEUDO and
/// as a subtract for any other opcode. Operand 0 is the destination and
/// operands 1 and 2 are the two sources.
///
/// When the subtarget reports hasScalarAddSub64(), a single S_ADD_U64 /
/// S_SUB_U64 is emitted. Otherwise the operation is split into a low-half
/// S_ADD_U32 / S_SUB_U32 followed by a high-half S_ADDC_U32 / S_SUBB_U32, and
/// the two 32-bit results are recombined with a REG_SEQUENCE.
///
/// \param MI the pseudo instruction to expand; it is erased before returning.
/// \param BB the block containing \p MI; new instructions are inserted
///           immediately before \p MI.
/// \returns \p BB.
static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
                                                      MachineBasicBlock *BB) {
  // For targets older than GFX12, we emit a sequence of 32-bit operations.
  // For GFX12, we emit s_add_u64 and s_sub_u64.
  MachineFunction *MF = BB->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineOperand &Dest = MI.getOperand(0);
  MachineOperand &Src0 = MI.getOperand(1);
  MachineOperand &Src1 = MI.getOperand(2);
  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
  if (ST.hasScalarAddSub64()) {
    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
    // clang-format off
    BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
        .add(Src0)
        .add(Src1);
    // clang-format on
  } else {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const TargetRegisterClass *BoolRC = TRI->getBoolRC();

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    // Split both sources into their low (sub0) and high (sub1) halves.
    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    // The high-half opcode is the carry/borrow variant; presumably it consumes
    // the carry produced by the low-half instruction, so keep the two
    // instructions in this order.
    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
    BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
    BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);

    // Reassemble the 64-bit destination from the two 32-bit halves.
    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
        .addReg(DestSub0)
        .addImm(AMDGPU::sub0)
        .addReg(DestSub1)
        .addImm(AMDGPU::sub1);
  }
  MI.eraseFromParent();
  return BB;
}
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
switch (Opc) {
case AMDGPU::S_MIN_U32:
return std::numeric_limits<uint32_t>::max();
@ -5210,7 +5262,31 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
case AMDGPU::S_AND_B32:
return std::numeric_limits<uint32_t>::max();
default:
llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
llvm_unreachable(
"Unexpected opcode in getIdentityValueFor32BitWaveReduction");
}
}
/// Returns the 64-bit value associated with \p Opc that callers use as the
/// identity (neutral) element of a 64-bit wave reduction.
///
/// Signed limits are returned as their two's-complement bit pattern in a
/// `uint64_t`. Any opcode not listed here reaches `llvm_unreachable`.
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
  switch (Opc) {
  // All bits set: neutral for unsigned min and for bitwise and.
  case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
  case AMDGPU::S_AND_B64:
    return std::numeric_limits<uint64_t>::max();

  // Zero: neutral for unsigned max, add, sub, or and xor.
  case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
    return std::numeric_limits<uint64_t>::min();

  // Signed limits, reinterpreted as unsigned bit patterns.
  case AMDGPU::V_CMP_LT_I64_e64: // min.i64
    return static_cast<uint64_t>(std::numeric_limits<int64_t>::max());
  case AMDGPU::V_CMP_GT_I64_e64: // max.i64
    return static_cast<uint64_t>(std::numeric_limits<int64_t>::min());

  default:
    llvm_unreachable(
        "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
  }
}
@ -5241,53 +5317,99 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
RetBB = &BB;
break;
}
case AMDGPU::V_CMP_LT_U64_e64: // umin
case AMDGPU::V_CMP_LT_I64_e64: // min
case AMDGPU::V_CMP_GT_U64_e64: // umax
case AMDGPU::V_CMP_GT_I64_e64: // max
case AMDGPU::S_AND_B64:
case AMDGPU::S_OR_B64: {
// Idempotent operations.
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
RetBB = &BB;
break;
}
case AMDGPU::S_XOR_B32:
case AMDGPU::S_XOR_B64:
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32: {
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_I32:
case AMDGPU::S_SUB_U64_PSEUDO: {
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
Register NumActiveLanes =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
unsigned CountReg =
unsigned BitCountOpc =
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
auto Exec =
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
.addReg(Exec->getOperand(0).getReg());
auto NewAccumulator =
BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
.addReg(ExecMask);
switch (Opc) {
case AMDGPU::S_XOR_B32: {
case AMDGPU::S_XOR_B32:
case AMDGPU::S_XOR_B64: {
// Performing an XOR operation on a uniform value
// depends on the parity of the number of active lanes.
// For even parity, the result will be 0, for odd
// parity the result will be the same as the input value.
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
Register ParityRegister =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
auto ParityReg =
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
.addReg(NewAccumulator->getOperand(0).getReg())
.addImm(1);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(SrcReg)
.addReg(ParityReg->getOperand(0).getReg());
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
.addReg(NewAccumulator->getOperand(0).getReg())
.addImm(1)
.setOperandDead(3); // Dead scc
if (Opc == AMDGPU::S_XOR_B32) {
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(SrcReg)
.addReg(ParityRegister);
} else {
Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
const TargetRegisterClass *SrcSubRC =
TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
.add(Op1L)
.addReg(ParityRegister);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
.add(Op1H)
.addReg(ParityRegister);
BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
.addReg(DestSub0)
.addImm(AMDGPU::sub0)
.addReg(DestSub1)
.addImm(AMDGPU::sub1);
}
break;
}
case AMDGPU::S_SUB_I32: {
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
// Take the negation of the source operand.
auto InvertedValReg =
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
.addImm(-1)
.addReg(SrcReg);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
.addImm(0)
.addReg(SrcReg);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(InvertedValReg->getOperand(0).getReg())
.addReg(NegatedVal)
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
@ -5297,6 +5419,75 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register Op1H_Op0L_Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register Op1L_Op0H_Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register NegatedValLo =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register NegatedValHi =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
const TargetRegisterClass *Src1SubRC =
TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
.addImm(0)
.addReg(NewAccumulator->getOperand(0).getReg())
.setOperandDead(3); // Dead scc
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
.addReg(NegatedValLo)
.addImm(31)
.setOperandDead(3); // Dead scc
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
.add(Op1L)
.addReg(NegatedValHi);
}
Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
? NegatedValLo
: NewAccumulator->getOperand(0).getReg();
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
.add(Op1L)
.addReg(LowOpcode);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
.add(Op1L)
.addReg(LowOpcode);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
.add(Op1H)
.addReg(LowOpcode);
Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
.addReg(CarryReg)
.addReg(Op1H_Op0L_Reg)
.setOperandDead(3); // Dead scc
if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
.addReg(HiVal)
.addReg(Op1L_Op0H_Reg)
.setOperandDead(3); // Dead scc
}
BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
.addReg(DestSub0)
.addImm(AMDGPU::sub0)
.addReg(DestSub1)
.addImm(AMDGPU::sub1);
break;
}
}
RetBB = &BB;
}
@ -5313,6 +5504,11 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
// so that we will get the next active lane for next iteration.
MachineBasicBlock::iterator I = BB.end();
Register SrcReg = MI.getOperand(1).getReg();
bool is32BitOpc = (Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
Opc == AMDGPU::S_XOR_B32);
// Create Control flow for loop
// Split MI's Machine Basic block into For loop
@ -5322,73 +5518,162 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
Register LaneValueReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
// Create initial values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlock
uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
auto TmpSReg =
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
.addImm(InitalValue);
BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
if (is32BitOpc) {
uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
.addImm(IdentityValue);
} else {
uint64_t IdentityValue = getIdentityValueFor64BitWaveReduction(Opc);
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
.addImm(IdentityValue);
}
// clang-format off
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
.addMBB(ComputeLoop);
// clang-format on
// Start constructing ComputeLoop
I = ComputeLoop->end();
I = ComputeLoop->begin();
auto Accumulator =
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
.addReg(InitalValReg)
.addReg(IdentityValReg)
.addMBB(&BB);
auto ActiveBits =
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
.addReg(TmpSReg->getOperand(0).getReg())
.addReg(LoopIterator)
.addMBB(&BB);
I = ComputeLoop->end();
MachineInstr *NewAccumulator;
// Perform the computations
unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
.addReg(ActiveBits->getOperand(0).getReg());
auto LaneValue = BuildMI(*ComputeLoop, I, DL,
TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
.addReg(SrcReg)
.addReg(FF1->getOperand(0).getReg());
auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
.addReg(Accumulator->getOperand(0).getReg())
.addReg(LaneValue->getOperand(0).getReg());
BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
.addReg(ActiveBitsReg);
if (is32BitOpc) {
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
LaneValueReg)
.addReg(SrcReg)
.addReg(FF1Reg);
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
.addReg(Accumulator->getOperand(0).getReg())
.addReg(LaneValueReg);
} else {
Register LaneValueLoReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
Register LaneValueHiReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
const TargetRegisterClass *SrcSubRC =
TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
// lane value input should be in an sgpr
MachineInstr *LaneValueLo =
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
LaneValueLoReg)
.add(Op1L)
.addReg(FF1Reg);
MachineInstr *LaneValueHi =
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
LaneValueHiReg)
.add(Op1H)
.addReg(FF1Reg);
auto LaneValue = BuildMI(*ComputeLoop, I, DL,
TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
.addReg(LaneValueLoReg)
.addImm(AMDGPU::sub0)
.addReg(LaneValueHiReg)
.addImm(AMDGPU::sub1);
switch (Opc) {
case AMDGPU::S_OR_B64:
case AMDGPU::S_AND_B64:
case AMDGPU::S_XOR_B64: {
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
.addReg(Accumulator->getOperand(0).getReg())
.addReg(LaneValue->getOperand(0).getReg())
.setOperandDead(3); // Dead scc
break;
}
case AMDGPU::V_CMP_GT_I64_e64:
case AMDGPU::V_CMP_GT_U64_e64:
case AMDGPU::V_CMP_LT_I64_e64:
case AMDGPU::V_CMP_LT_U64_e64: {
Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
Register ComparisonResultReg =
MRI.createVirtualRegister(WaveMaskRegClass);
const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
const TargetRegisterClass *VSubRegClass =
TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
MachineOperand SrcReg0Sub0 =
TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
VregClass, AMDGPU::sub0, VSubRegClass);
MachineOperand SrcReg0Sub1 =
TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
VregClass, AMDGPU::sub1, VSubRegClass);
BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
AccumulatorVReg)
.add(SrcReg0Sub0)
.addImm(AMDGPU::sub0)
.add(SrcReg0Sub1)
.addImm(AMDGPU::sub1);
BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
.addReg(LaneValue->getOperand(0).getReg())
.addReg(AccumulatorVReg);
unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
.addReg(LaneMaskReg)
.addReg(ActiveBitsReg);
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
TII->get(AMDGPU::S_CSELECT_B64), DstReg)
.addReg(LaneValue->getOperand(0).getReg())
.addReg(Accumulator->getOperand(0).getReg());
break;
}
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
.addReg(Accumulator->getOperand(0).getReg())
.addReg(LaneValue->getOperand(0).getReg());
ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
break;
}
}
}
// Manipulate the iterator to get the next active lane
unsigned BITSETOpc =
IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
auto NewActiveBits =
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
.addReg(FF1->getOperand(0).getReg())
.addReg(ActiveBits->getOperand(0).getReg());
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
.addReg(FF1Reg)
.addReg(ActiveBitsReg);
// Add phi nodes
Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
.addMBB(ComputeLoop);
ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
.addMBB(ComputeLoop);
Accumulator.addReg(DstReg).addMBB(ComputeLoop);
ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
// Creating branching
unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
.addReg(NewActiveBits->getOperand(0).getReg())
.addReg(NewActiveBitsReg)
.addImm(0);
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
.addMBB(ComputeLoop);
@ -5410,22 +5695,40 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
switch (MI.getOpcode()) {
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
const DebugLoc &DL = MI.getDebugLoc();
@ -5452,55 +5755,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
}
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
// For targets older than GFX12, we emit a sequence of 32-bit operations.
// For GFX12, we emit s_add_u64 and s_sub_u64.
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
const DebugLoc &DL = MI.getDebugLoc();
MachineOperand &Dest = MI.getOperand(0);
MachineOperand &Src0 = MI.getOperand(1);
MachineOperand &Src1 = MI.getOperand(2);
bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
if (Subtarget->hasScalarAddSub64()) {
unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
// clang-format off
BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
.add(Src0)
.add(Src1);
// clang-format on
} else {
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
.add(Src0Sub0)
.add(Src1Sub0);
BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
.add(Src0Sub1)
.add(Src1Sub1);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
.addReg(DestSub0)
.addImm(AMDGPU::sub0)
.addReg(DestSub1)
.addImm(AMDGPU::sub1);
}
MI.eraseFromParent();
return BB;
return Expand64BitScalarArithmetic(MI, BB);
}
case AMDGPU::V_ADD_U64_PSEUDO:
case AMDGPU::V_SUB_U64_PSEUDO: {

View File

@ -304,28 +304,57 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
// clang-format off
defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
multiclass
AMDGPUWaveReducePseudoGenerator<string Op, string DataType> {
AMDGPUWaveReducePseudoGenerator<string Op, string DataType, ValueType ty, RegisterClass RetReg, SrcRegOrImm9 Reg> {
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
def !toupper(Op) #"_PSEUDO_" #DataType
: VPseudoInstSI<(outs SGPR_32 : $sdst),
(ins VSrc_b32 : $src, VSrc_b32 : $strategy),
[(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
: VPseudoInstSI<(outs RetReg : $sdst),
(ins Reg : $src, VSrc_b32 : $strategy),
[(set ty : $sdst, (!cast<AMDGPUWaveReduce>("int_amdgcn_wave_reduce_" #Op) ty : $src, i32 : $strategy))]> {}
}
}
// clang-format on
// Describes one wave-reduction pseudo to generate: the operation name, the
// type-suffix string, the value type, the result register class and the
// source operand class. The fields are read by name when each record is
// expanded into a pseudo instruction.
class WaveReduceOp<string OpName, string TypeStr, ValueType Ty,
RegisterClass ReturnRegisterClass, SrcRegOrImm9 RC> {
string Name = OpName; // operation name, e.g. "umin"
string TypeString = TypeStr; // type suffix, e.g. "U32" or "B64"
ValueType VT = Ty; // value type of the reduced operand and result
RegisterClass RetReg = ReturnRegisterClass; // register class of the result
SrcRegOrImm9 Reg = RC; // operand class of the source value
}
// Input list : [Operation_name,
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B)]
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
// value type (selects the bit-width: i32/i64),
// output register class,
// input register class]
defvar Operations = [
["umin", "U32"], ["min", "I32"], ["umax", "U32"], ["max", "I32"],
["add", "I32"], ["sub", "I32"], ["and", "B32"], ["or", "B32"],
["xor", "B32"]
WaveReduceOp<"umin", "U32", i32, SGPR_32, VSrc_b32>,
WaveReduceOp<"min", "I32", i32, SGPR_32, VSrc_b32>,
WaveReduceOp<"umax", "U32", i32, SGPR_32, VSrc_b32>,
WaveReduceOp<"max", "I32", i32, SGPR_32, VSrc_b32>,
WaveReduceOp<"add", "I32", i32, SGPR_32, VSrc_b32>,
WaveReduceOp<"sub", "I32", i32, SGPR_32, VSrc_b32>,
WaveReduceOp<"and", "B32", i32, SGPR_32, VSrc_b32>,
WaveReduceOp<"or", "B32", i32, SGPR_32, VSrc_b32>,
WaveReduceOp<"xor", "B32", i32, SGPR_32, VSrc_b32>,
WaveReduceOp<"umin", "U64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"and", "B64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"or", "B64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>,
];
foreach Op = Operations in {
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1]>;
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op.Name, Op.TypeString,
Op.VT, Op.RetReg, Op.Reg>;
}
let usesCustomInserter = 1, Defs = [VCC] in {

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -2,13 +2,14 @@
; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
; --------------------------------------------------------------------
; llvm.amdgcn.wave.reduce.umin.i32
; llvm.amdgcn.wave.reduce.umin
; --------------------------------------------------------------------
declare i32 @llvm.amdgcn.wave.reduce.umin.i32(i32, i32 immarg)
declare i64 @llvm.amdgcn.wave.reduce.umin.i64(i64, i32 immarg)
define amdgpu_kernel void @test_constant_fold_wave_reduce_umin_poison(ptr addrspace(1) %out, i32 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_umin_poison(
define amdgpu_kernel void @test_constant_fold_wave_reduce_umin_i32_poison(ptr addrspace(1) %out, i32 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_umin_i32_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
@ -19,8 +20,8 @@ entry:
ret void
}
define amdgpu_kernel void @test_constant_fold_wave_reduce_umin_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_umin_const(
define amdgpu_kernel void @test_constant_fold_wave_reduce_umin_i32_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_umin_i32_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
@ -31,14 +32,94 @@ entry:
ret void
}
; instsimplify should fold wave.reduce.umin.i64 of a poison operand to poison.
define amdgpu_kernel void @test_constant_fold_wave_reduce_umin_i64_poison(ptr addrspace(1) %out, i64 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_umin_i64_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.umin.i64(i64 poison, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}
; instsimplify should fold wave.reduce.umin.i64 of the constant 123 to 123.
define amdgpu_kernel void @test_constant_fold_wave_reduce_umin_i64_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_umin_i64_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.umin.i64(i64 123, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}
; --------------------------------------------------------------------
; llvm.amdgcn.wave.reduce.min
; --------------------------------------------------------------------
declare i32 @llvm.amdgcn.wave.reduce.min.i32(i32, i32 immarg)
declare i64 @llvm.amdgcn.wave.reduce.min.i64(i64, i32 immarg)
; instsimplify should fold wave.reduce.min.i32 of a poison operand to poison.
define amdgpu_kernel void @test_constant_fold_wave_reduce_min_i32_poison(ptr addrspace(1) %out, i32 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_min_i32_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 poison, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
; instsimplify should fold wave.reduce.min.i32 of the constant 123 to 123.
define amdgpu_kernel void @test_constant_fold_wave_reduce_min_i32_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_min_i32_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 123, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
; instsimplify should fold wave.reduce.min.i64 of a poison operand to poison.
define amdgpu_kernel void @test_constant_fold_wave_reduce_min_i64_poison(ptr addrspace(1) %out, i64 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_min_i64_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.min.i64(i64 poison, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}
; instsimplify should fold wave.reduce.min.i64 of the constant 123 to 123.
define amdgpu_kernel void @test_constant_fold_wave_reduce_min_i64_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_min_i64_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.min.i64(i64 123, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}
; --------------------------------------------------------------------
; llvm.amdgcn.wave.reduce.umax
; --------------------------------------------------------------------
declare i32 @llvm.amdgcn.wave.reduce.umax.i32(i32, i32 immarg)
declare i64 @llvm.amdgcn.wave.reduce.umax.i64(i64, i32 immarg)
define amdgpu_kernel void @test_constant_fold_wave_reduce_umax_poison(ptr addrspace(1) %out, i32 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_umax_poison(
define amdgpu_kernel void @test_constant_fold_wave_reduce_umax_i32_poison(ptr addrspace(1) %out, i32 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_umax_i32_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
@ -49,8 +130,8 @@ entry:
ret void
}
define amdgpu_kernel void @test_constant_fold_wave_reduce_umax_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_umax_const(
define amdgpu_kernel void @test_constant_fold_wave_reduce_umax_i32_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_umax_i32_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
@ -61,6 +142,30 @@ entry:
ret void
}
; instsimplify should fold wave.reduce.umax.i64 of a poison operand to poison.
; The unused %in argument is declared i64 so the signature matches the other
; i64 poison tests in this file.
define amdgpu_kernel void @test_constant_fold_wave_reduce_umax_i64_poison(ptr addrspace(1) %out, i64 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_umax_i64_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.umax.i64(i64 poison, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}
; instsimplify should fold wave.reduce.umax.i64 of the constant 123 to 123.
define amdgpu_kernel void @test_constant_fold_wave_reduce_umax_i64_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_umax_i64_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.umax.i64(i64 123, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}
@gv = constant i32 0
define amdgpu_kernel void @test_constant_fold_wave_reduce_umax_gv(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_umax_gv(
@ -74,3 +179,333 @@ entry:
store i32 %result, ptr addrspace(1) %out
ret void
}
; --------------------------------------------------------------------
; llvm.amdgcn.wave.reduce.max
; --------------------------------------------------------------------
declare i32 @llvm.amdgcn.wave.reduce.max.i32(i32, i32 immarg)
declare i64 @llvm.amdgcn.wave.reduce.max.i64(i64, i32 immarg)
; instsimplify should fold wave.reduce.max.i32 of a poison operand to poison.
define amdgpu_kernel void @test_constant_fold_wave_reduce_max_i32_poison(ptr addrspace(1) %out, i32 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_max_i32_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 poison, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
; instsimplify should fold wave.reduce.max.i32 of the constant 123 to 123.
define amdgpu_kernel void @test_constant_fold_wave_reduce_max_i32_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_max_i32_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 123, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
; instsimplify should fold wave.reduce.max.i64 of a poison operand to poison.
define amdgpu_kernel void @test_constant_fold_wave_reduce_max_i64_poison(ptr addrspace(1) %out, i64 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_max_i64_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.max.i64(i64 poison, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}
; instsimplify should fold wave.reduce.max.i64 of the constant 123 to 123.
define amdgpu_kernel void @test_constant_fold_wave_reduce_max_i64_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_max_i64_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.max.i64(i64 123, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}
; --------------------------------------------------------------------
; llvm.amdgcn.wave.reduce.add
; --------------------------------------------------------------------
declare i32 @llvm.amdgcn.wave.reduce.add.i32(i32, i32 immarg)
declare i64 @llvm.amdgcn.wave.reduce.add.i64(i64, i32 immarg)
; instsimplify should fold wave.reduce.add.i32 of a poison operand to poison.
define amdgpu_kernel void @test_constant_fold_wave_reduce_add_i32_poison(ptr addrspace(1) %out, i32 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_add_i32_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 poison, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
; Expects wave.reduce.add.i32 of the constant 123 to fold to 123.
; NOTE(review): folding a uniform constant to itself holds for idempotent ops
; (min/max/and/or); for add the result presumably scales with the number of
; active lanes -- confirm this fold is intended.
define amdgpu_kernel void @test_constant_fold_wave_reduce_add_i32_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_add_i32_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 123, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
; instsimplify should fold wave.reduce.add.i64 of a poison operand to poison.
define amdgpu_kernel void @test_constant_fold_wave_reduce_add_i64_poison(ptr addrspace(1) %out, i64 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_add_i64_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 poison, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}
; Expects wave.reduce.add.i64 of the constant 123 to fold to 123.
; NOTE(review): folding a uniform constant to itself holds for idempotent ops
; (min/max/and/or); for add the result presumably scales with the number of
; active lanes -- confirm this fold is intended.
define amdgpu_kernel void @test_constant_fold_wave_reduce_add_i64_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_add_i64_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 123, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}
; --------------------------------------------------------------------
; llvm.amdgcn.wave.reduce.sub
; --------------------------------------------------------------------
declare i32 @llvm.amdgcn.wave.reduce.sub.i32(i32, i32 immarg)
declare i64 @llvm.amdgcn.wave.reduce.sub.i64(i64, i32 immarg)
; instsimplify should fold wave.reduce.sub.i32 of a poison operand to poison.
define amdgpu_kernel void @test_constant_fold_wave_reduce_sub_i32_poison(ptr addrspace(1) %out, i32 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_sub_i32_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 poison, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
; Expects wave.reduce.sub.i32 of the constant 123 to fold to 123.
; NOTE(review): folding a uniform constant to itself holds for idempotent ops
; (min/max/and/or); for sub the result presumably depends on the number of
; active lanes -- confirm this fold is intended.
define amdgpu_kernel void @test_constant_fold_wave_reduce_sub_i32_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_sub_i32_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 123, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
; instsimplify should fold wave.reduce.sub.i64 of a poison operand to poison.
define amdgpu_kernel void @test_constant_fold_wave_reduce_sub_i64_poison(ptr addrspace(1) %out, i64 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_sub_i64_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 poison, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}
; Expects wave.reduce.sub.i64 of the constant 123 to fold to 123.
; NOTE(review): folding a uniform constant to itself holds for idempotent ops
; (min/max/and/or); for sub the result presumably depends on the number of
; active lanes -- confirm this fold is intended.
define amdgpu_kernel void @test_constant_fold_wave_reduce_sub_i64_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_sub_i64_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 123, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}
; --------------------------------------------------------------------
; llvm.amdgcn.wave.reduce.and
; --------------------------------------------------------------------
declare i32 @llvm.amdgcn.wave.reduce.and.i32(i32, i32 immarg)
declare i64 @llvm.amdgcn.wave.reduce.and.i64(i64, i32 immarg)
; instsimplify should fold wave.reduce.and.i32 of a poison operand to poison.
define amdgpu_kernel void @test_constant_fold_wave_reduce_and_i32_poison(ptr addrspace(1) %out, i32 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_and_i32_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 poison, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
; instsimplify should fold wave.reduce.and.i32 of the constant 123 to 123.
define amdgpu_kernel void @test_constant_fold_wave_reduce_and_i32_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_and_i32_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 123, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
; instsimplify should fold wave.reduce.and.i64 of a poison operand to poison.
define amdgpu_kernel void @test_constant_fold_wave_reduce_and_i64_poison(ptr addrspace(1) %out, i64 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_and_i64_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.and.i64(i64 poison, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}
; instsimplify should fold wave.reduce.and.i64 of the constant 123 to 123.
define amdgpu_kernel void @test_constant_fold_wave_reduce_and_i64_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_and_i64_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.and.i64(i64 123, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}
; --------------------------------------------------------------------
; llvm.amdgcn.wave.reduce.or
; --------------------------------------------------------------------
declare i32 @llvm.amdgcn.wave.reduce.or.i32(i32, i32 immarg)
declare i64 @llvm.amdgcn.wave.reduce.or.i64(i64, i32 immarg)
; instsimplify should fold wave.reduce.or.i32 of a poison operand to poison.
define amdgpu_kernel void @test_constant_fold_wave_reduce_or_i32_poison(ptr addrspace(1) %out, i32 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_or_i32_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 poison, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
; instsimplify should fold wave.reduce.or.i32 of the constant 123 to 123.
define amdgpu_kernel void @test_constant_fold_wave_reduce_or_i32_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_or_i32_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 123, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
; instsimplify should fold wave.reduce.or.i64 of a poison operand to poison.
define amdgpu_kernel void @test_constant_fold_wave_reduce_or_i64_poison(ptr addrspace(1) %out, i64 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_or_i64_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.or.i64(i64 poison, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}
; instsimplify should fold wave.reduce.or.i64 of the constant 123 to 123.
define amdgpu_kernel void @test_constant_fold_wave_reduce_or_i64_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_or_i64_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.or.i64(i64 123, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}
; --------------------------------------------------------------------
; llvm.amdgcn.wave.reduce.xor
; --------------------------------------------------------------------
declare i32 @llvm.amdgcn.wave.reduce.xor.i32(i32, i32 immarg)
declare i64 @llvm.amdgcn.wave.reduce.xor.i64(i64, i32 immarg)
; instsimplify should fold wave.reduce.xor.i32 of a poison operand to poison.
define amdgpu_kernel void @test_constant_fold_wave_reduce_xor_i32_poison(ptr addrspace(1) %out, i32 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_xor_i32_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 poison, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
; Expects wave.reduce.xor.i32 of the constant 123 to fold to 123.
; NOTE(review): folding a uniform constant to itself holds for idempotent ops
; (min/max/and/or); for xor the result presumably depends on the parity of the
; active lane count -- confirm this fold is intended.
define amdgpu_kernel void @test_constant_fold_wave_reduce_xor_i32_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_xor_i32_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 123, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
; instsimplify should fold wave.reduce.xor.i64 of a poison operand to poison.
define amdgpu_kernel void @test_constant_fold_wave_reduce_xor_i64_poison(ptr addrspace(1) %out, i64 %in) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_xor_i64_poison(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 poison, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 poison, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}
; Expects wave.reduce.xor.i64 of the constant 123 to fold to 123.
; NOTE(review): folding a uniform constant to itself holds for idempotent ops
; (min/max/and/or); for xor the result presumably depends on the parity of the
; active lane count -- confirm this fold is intended.
define amdgpu_kernel void @test_constant_fold_wave_reduce_xor_i64_const(ptr addrspace(1) %out) {
; CHECK-LABEL: @test_constant_fold_wave_reduce_xor_i64_const(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i64 123, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%result = call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 123, i32 1)
store i64 %result, ptr addrspace(1) %out
ret void
}