[AMDGPU] Add intrinsics for v_[pk]_add_{min|max}_* instructions (#164731)
This commit is contained in:
parent
d162025d8b
commit
9b5bc98743
@ -830,6 +830,15 @@ TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b4_u4, "V2UiUiUiV2Ui", "nc", "tensor-c
|
||||
TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b6_u4, "V3UiUiULiV2Ui", "nc", "tensor-cvt-lut-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b8_u4, "V4UiULiULiV2Ui", "nc", "tensor-cvt-lut-insts")
|
||||
|
||||
// add_{min|max} builtins. Each takes three operands of the result type plus a
// trailing "Ib" argument, which has to be a compile-time constant. The 32-bit
// forms are gated on "add-min-max-insts", the packed 2 x 16-bit forms on
// "pk-add-min-max-insts".
TARGET_BUILTIN(__builtin_amdgcn_add_max_i32, "iiiiIb", "nc", "add-min-max-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_add_max_u32, "UiUiUiUiIb", "nc", "add-min-max-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_add_min_i32, "iiiiIb", "nc", "add-min-max-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_add_min_u32, "UiUiUiUiIb", "nc", "add-min-max-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_pk_add_max_i16, "V2sV2sV2sV2sIb", "nc", "pk-add-min-max-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_pk_add_max_u16, "V2UsV2UsV2UsV2UsIb", "nc", "pk-add-min-max-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_pk_add_min_i16, "V2sV2sV2sV2sIb", "nc", "pk-add-min-max-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_pk_add_min_u16, "V2UsV2UsV2UsV2UsIb", "nc", "pk-add-min-max-insts")
|
||||
|
||||
// GFX1250 WMMA builtins
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_bf16, "V8fIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
|
||||
|
||||
@ -26,8 +26,8 @@ kernel void foo(global int *p) { *p = 1; }
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
//.
|
||||
// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" "uniform-work-group-size"="false" }
|
||||
// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" }
|
||||
// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" "uniform-work-group-size"="false" }
|
||||
// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" }
|
||||
// CHECK: attributes #[[ATTR2]] = { convergent nounwind }
|
||||
//.
|
||||
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
|
||||
|
||||
@ -109,8 +109,8 @@
|
||||
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
|
||||
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
|
||||
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
|
||||
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
|
||||
// GFX1251: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
|
||||
// GFX1250: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
|
||||
// GFX1251: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
|
||||
|
||||
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
|
||||
|
||||
|
||||
@ -21,6 +21,7 @@ typedef float __attribute__((ext_vector_type(8))) float8;
|
||||
typedef float __attribute__((ext_vector_type(16))) float16;
|
||||
typedef float __attribute__((ext_vector_type(32))) float32;
|
||||
typedef short __attribute__((ext_vector_type(2))) short2;
|
||||
typedef unsigned short __attribute__((ext_vector_type(2))) ushort2;
|
||||
|
||||
// CHECK-LABEL: @test_setprio_inc_wg(
|
||||
// CHECK-NEXT: entry:
|
||||
@ -1718,3 +1719,111 @@ void test_cvt_f32_fp8_e5m3(global int* out, int a)
|
||||
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 2);
|
||||
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 3);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_add_min_max(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
|
||||
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
|
||||
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
|
||||
// CHECK-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
|
||||
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
|
||||
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
|
||||
// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
|
||||
// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
|
||||
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
|
||||
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: store i32 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.add.max.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false)
|
||||
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
|
||||
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
|
||||
// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.add.max.u32(i32 [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i1 true)
|
||||
// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
|
||||
// CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4
|
||||
// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.add.min.i32(i32 [[TMP10]], i32 [[TMP11]], i32 [[TMP12]], i1 false)
|
||||
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
|
||||
// CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
|
||||
// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.add.min.u32(i32 [[TMP15]], i32 [[TMP16]], i32 [[TMP17]], i1 true)
|
||||
// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
|
||||
// CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
// Calls each 32-bit add_{min|max} builtin once and stores the result to *out.
// The last argument is a literal: false for the i32 variants, true for the u32
// variants, which is why the expected calls alternate `i1 false` / `i1 true`.
void test_add_min_max(global int *out, int a, int b, int c)
|
||||
{
|
||||
*out = __builtin_amdgcn_add_max_i32(a, b, c, false);
|
||||
*out = __builtin_amdgcn_add_max_u32(a, b, c, true);
|
||||
*out = __builtin_amdgcn_add_min_i32(a, b, c, false);
|
||||
*out = __builtin_amdgcn_add_min_u32(a, b, c, true);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_pk_add_min_max(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
|
||||
// CHECK-NEXT: [[UOUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
|
||||
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
|
||||
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
|
||||
// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
|
||||
// CHECK-NEXT: [[UA_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
|
||||
// CHECK-NEXT: [[UB_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
|
||||
// CHECK-NEXT: [[UC_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
|
||||
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
|
||||
// CHECK-NEXT: [[UOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UOUT_ADDR]] to ptr
|
||||
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
|
||||
// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
|
||||
// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
|
||||
// CHECK-NEXT: [[UA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UA_ADDR]] to ptr
|
||||
// CHECK-NEXT: [[UB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UB_ADDR]] to ptr
|
||||
// CHECK-NEXT: [[UC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UC_ADDR]] to ptr
|
||||
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
|
||||
// CHECK-NEXT: store ptr addrspace(1) [[UOUT:%.*]], ptr [[UOUT_ADDR_ASCAST]], align 8
|
||||
// CHECK-NEXT: store <2 x i16> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: store <2 x i16> [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: store <2 x i16> [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: store <2 x i16> [[UA:%.*]], ptr [[UA_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: store <2 x i16> [[UB:%.*]], ptr [[UB_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: store <2 x i16> [[UC:%.*]], ptr [[UC_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[A_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[B_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr [[C_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <2 x i16> [[TMP2]], i1 false)
|
||||
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
|
||||
// CHECK-NEXT: store <2 x i16> [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
|
||||
// CHECK-NEXT: [[TMP5:%.*]] = load <2 x i16>, ptr [[UA_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i16>, ptr [[UB_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr [[UC_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP8:%.*]] = call <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]], <2 x i16> [[TMP7]], i1 true)
|
||||
// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[UOUT_ADDR_ASCAST]], align 8
|
||||
// CHECK-NEXT: store <2 x i16> [[TMP8]], ptr addrspace(1) [[TMP9]], align 4
|
||||
// CHECK-NEXT: [[TMP10:%.*]] = load <2 x i16>, ptr [[A_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP11:%.*]] = load <2 x i16>, ptr [[B_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP12:%.*]] = load <2 x i16>, ptr [[C_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP13:%.*]] = call <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16> [[TMP10]], <2 x i16> [[TMP11]], <2 x i16> [[TMP12]], i1 false)
|
||||
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
|
||||
// CHECK-NEXT: store <2 x i16> [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
|
||||
// CHECK-NEXT: [[TMP15:%.*]] = load <2 x i16>, ptr [[UA_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP16:%.*]] = load <2 x i16>, ptr [[UB_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP17:%.*]] = load <2 x i16>, ptr [[UC_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP18:%.*]] = call <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16> [[TMP15]], <2 x i16> [[TMP16]], <2 x i16> [[TMP17]], i1 true)
|
||||
// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[UOUT_ADDR_ASCAST]], align 8
|
||||
// CHECK-NEXT: store <2 x i16> [[TMP18]], ptr addrspace(1) [[TMP19]], align 4
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
// Calls each packed 16-bit add_{min|max} builtin once. Signed variants take the
// short2 operands and store to *out; unsigned variants take the ushort2
// operands and store to *uout. The last argument is a literal: false for the
// i16 variants, true for the u16 variants.
void test_pk_add_min_max(global short2 *out, global ushort2 *uout, short2 a, short2 b, short2 c, ushort2 ua, ushort2 ub, ushort2 uc)
|
||||
{
|
||||
*out = __builtin_amdgcn_pk_add_max_i16(a, b, c, false);
|
||||
*uout = __builtin_amdgcn_pk_add_max_u16(ua, ub, uc, true);
|
||||
*out = __builtin_amdgcn_pk_add_min_i16(a, b, c, false);
|
||||
*uout = __builtin_amdgcn_pk_add_min_u16(ua, ub, uc, true);
|
||||
}
|
||||
|
||||
@ -15,6 +15,8 @@ typedef half __attribute__((ext_vector_type(32))) half32;
|
||||
typedef float __attribute__((ext_vector_type(8))) float8;
|
||||
typedef float __attribute__((ext_vector_type(16))) float16;
|
||||
typedef float __attribute__((ext_vector_type(32))) float32;
|
||||
typedef short __attribute__((ext_vector_type(2))) short2;
|
||||
typedef unsigned short __attribute__((ext_vector_type(2))) ushort2;
|
||||
|
||||
typedef int v4i __attribute__((ext_vector_type(4)));
|
||||
typedef int v8i __attribute__((ext_vector_type(8)));
|
||||
@ -165,3 +167,19 @@ void test_cvt_f32_fp8_e5m3(global int* out, int a)
|
||||
{
|
||||
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, a); // expected-error {{'__builtin_amdgcn_cvt_f32_fp8_e5m3' must be a constant integer}}
|
||||
}
|
||||
|
||||
// Negative test: passing the runtime value `clamp` as the last argument of a
// 32-bit add_{min|max} builtin is expected to produce the
// "must be a constant integer" error on every call.
void test_add_min_max(global int *out, int a, int b, int c, bool clamp)
|
||||
{
|
||||
*out = __builtin_amdgcn_add_max_i32(a, b, c, clamp); // expected-error {{'__builtin_amdgcn_add_max_i32' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_add_max_u32(a, b, c, clamp); // expected-error {{'__builtin_amdgcn_add_max_u32' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_add_min_i32(a, b, c, clamp); // expected-error {{'__builtin_amdgcn_add_min_i32' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_add_min_u32(a, b, c, clamp); // expected-error {{'__builtin_amdgcn_add_min_u32' must be a constant integer}}
|
||||
}
|
||||
|
||||
// Negative test: passing the runtime value `clamp` as the last argument of a
// packed 16-bit add_{min|max} builtin is expected to produce the
// "must be a constant integer" error on every call.
void test_pk_add_min_max(global short2 *out, global ushort2 *uout, short2 a, short2 b, short2 c, ushort2 ua, ushort2 ub, ushort2 uc, bool clamp)
|
||||
{
|
||||
*out = __builtin_amdgcn_pk_add_max_i16(a, b, c, clamp); // expected-error {{'__builtin_amdgcn_pk_add_max_i16' must be a constant integer}}
|
||||
*uout = __builtin_amdgcn_pk_add_max_u16(ua, ub, uc, clamp); // expected-error {{'__builtin_amdgcn_pk_add_max_u16' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_pk_add_min_i16(a, b, c, clamp); // expected-error {{'__builtin_amdgcn_pk_add_min_i16' must be a constant integer}}
|
||||
*uout = __builtin_amdgcn_pk_add_min_u16(ua, ub, uc, clamp); // expected-error {{'__builtin_amdgcn_pk_add_min_u16' must be a constant integer}}
|
||||
}
|
||||
|
||||
@ -3,9 +3,21 @@
|
||||
|
||||
typedef unsigned int uint;
|
||||
typedef unsigned short int ushort;
|
||||
typedef short __attribute__((ext_vector_type(2))) short2;
|
||||
typedef unsigned short __attribute__((ext_vector_type(2))) ushort2;
|
||||
|
||||
void test(global uint* out, uint a, uint b, uint c) {
|
||||
void test(global int* out, global short2 *s2out, global ushort2 *us2out,
|
||||
uint a, uint b, uint c, short2 s2a, short2 s2b, short2 s2c,
|
||||
ushort2 us2a, ushort2 us2b, ushort2 us2c) {
|
||||
__builtin_amdgcn_s_setprio_inc_wg(1); // expected-error {{'__builtin_amdgcn_s_setprio_inc_wg' needs target feature setprio-inc-wg-inst}}
|
||||
*out = __builtin_amdgcn_bitop3_b32(a, b, c, 1); // expected-error {{'__builtin_amdgcn_bitop3_b32' needs target feature bitop3-insts}}
|
||||
*out = __builtin_amdgcn_bitop3_b16((ushort)a, (ushort)b, (ushort)c, 1); // expected-error {{'__builtin_amdgcn_bitop3_b16' needs target feature bitop3-insts}}
|
||||
*out = __builtin_amdgcn_add_max_i32(a, b, c, false); // expected-error {{'__builtin_amdgcn_add_max_i32' needs target feature add-min-max-insts}}
|
||||
*out = __builtin_amdgcn_add_max_u32(a, b, c, true); // expected-error {{'__builtin_amdgcn_add_max_u32' needs target feature add-min-max-insts}}
|
||||
*out = __builtin_amdgcn_add_min_i32(a, b, c, false); // expected-error {{'__builtin_amdgcn_add_min_i32' needs target feature add-min-max-insts}}
|
||||
*out = __builtin_amdgcn_add_min_u32(a, b, c, true); // expected-error {{'__builtin_amdgcn_add_min_u32' needs target feature add-min-max-insts}}
|
||||
*s2out = __builtin_amdgcn_pk_add_max_i16(s2a, s2b, s2c, false); // expected-error {{'__builtin_amdgcn_pk_add_max_i16' needs target feature pk-add-min-max-insts}}
|
||||
*us2out = __builtin_amdgcn_pk_add_max_u16(us2a, us2b, us2c, true); // expected-error {{'__builtin_amdgcn_pk_add_max_u16' needs target feature pk-add-min-max-insts}}
|
||||
*s2out = __builtin_amdgcn_pk_add_min_i16(s2a, s2b, s2c, false); // expected-error {{'__builtin_amdgcn_pk_add_min_i16' needs target feature pk-add-min-max-insts}}
|
||||
*us2out = __builtin_amdgcn_pk_add_min_u16(us2a, us2b, us2c, true); // expected-error {{'__builtin_amdgcn_pk_add_min_u16' needs target feature pk-add-min-max-insts}}
|
||||
}
|
||||
|
||||
@ -3789,6 +3789,20 @@ def int_amdgcn_perm_pk16_b8_u4 : ClangBuiltin<"__builtin_amdgcn_perm_pk16_b8_u4"
|
||||
DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_v2i32_ty],
|
||||
[IntrNoMem, IntrSpeculatable]>;
|
||||
|
||||
// Shared shape of the add_{min|max} intrinsics. The result and the first three
// operands all have type Ty; operand 3 is an i1 clamp flag that must be an
// immediate (ImmArg<ArgIndex<3>>). Name is appended to "__builtin_amdgcn_" to
// form the Clang builtin the intrinsic is bound to.
class AMDGPUAddMinMax<LLVMType Ty, string Name> : ClangBuiltin<"__builtin_amdgcn_"#Name>,
|
||||
DefaultAttrsIntrinsic<[Ty], [Ty, Ty, Ty, llvm_i1_ty /* clamp */],
|
||||
[IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]>;
|
||||
|
||||
// 32-bit variants (i32 operands and result).
def int_amdgcn_add_max_i32 : AMDGPUAddMinMax<llvm_i32_ty, "add_max_i32">;
|
||||
def int_amdgcn_add_max_u32 : AMDGPUAddMinMax<llvm_i32_ty, "add_max_u32">;
|
||||
def int_amdgcn_add_min_i32 : AMDGPUAddMinMax<llvm_i32_ty, "add_min_i32">;
|
||||
def int_amdgcn_add_min_u32 : AMDGPUAddMinMax<llvm_i32_ty, "add_min_u32">;
|
||||
|
||||
// Packed variants (<2 x i16> operands and result).
def int_amdgcn_pk_add_max_i16 : AMDGPUAddMinMax<llvm_v2i16_ty, "pk_add_max_i16">;
|
||||
def int_amdgcn_pk_add_max_u16 : AMDGPUAddMinMax<llvm_v2i16_ty, "pk_add_max_u16">;
|
||||
def int_amdgcn_pk_add_min_i16 : AMDGPUAddMinMax<llvm_v2i16_ty, "pk_add_min_i16">;
|
||||
def int_amdgcn_pk_add_min_u16 : AMDGPUAddMinMax<llvm_v2i16_ty, "pk_add_min_u16">;
|
||||
|
||||
class AMDGPUCooperativeAtomicStore<LLVMType Ty> : Intrinsic <
|
||||
[],
|
||||
[llvm_anyptr_ty, // pointer to store to
|
||||
|
||||
@ -1430,6 +1430,18 @@ def FeatureAddSubU64Insts
|
||||
def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst",
|
||||
"true", "Has v_mad_u32 instruction">;
|
||||
|
||||
// Subtarget feature named "add-min-max-insts"; it is tied to the
// HasAddMinMaxInsts field with the value "true".
def FeatureAddMinMaxInsts : SubtargetFeature<"add-min-max-insts",
|
||||
"HasAddMinMaxInsts",
|
||||
"true",
|
||||
"Has v_add_{min|max}_{i|u}32 instructions"
|
||||
>;
|
||||
|
||||
// Subtarget feature named "pk-add-min-max-insts"; it is tied to the
// HasPkAddMinMaxInsts field with the value "true".
def FeaturePkAddMinMaxInsts : SubtargetFeature<"pk-add-min-max-insts",
|
||||
"HasPkAddMinMaxInsts",
|
||||
"true",
|
||||
"Has v_pk_add_{min|max}_{i|u}16 instructions"
|
||||
>;
|
||||
|
||||
def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
|
||||
"HasVMemToLDSLoad",
|
||||
"true",
|
||||
@ -2115,6 +2127,8 @@ def FeatureISAVersion12_50 : FeatureSet<
|
||||
FeatureLshlAddU64Inst,
|
||||
FeatureAddSubU64Insts,
|
||||
FeatureMadU32Inst,
|
||||
FeatureAddMinMaxInsts,
|
||||
FeaturePkAddMinMaxInsts,
|
||||
FeatureLdsBarrierArriveAtomic,
|
||||
FeatureSetPrioIncWgInst,
|
||||
Feature45BitNumRecordsBufferResource,
|
||||
@ -2658,11 +2672,11 @@ def HasFmaakFmamkF64Insts :
|
||||
|
||||
def HasAddMinMaxInsts :
|
||||
Predicate<"Subtarget->hasAddMinMaxInsts()">,
|
||||
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
|
||||
AssemblerPredicate<(any_of FeatureAddMinMaxInsts)>;
|
||||
|
||||
def HasPkAddMinMaxInsts :
|
||||
Predicate<"Subtarget->hasPkAddMinMaxInsts()">,
|
||||
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
|
||||
AssemblerPredicate<(any_of FeaturePkAddMinMaxInsts)>;
|
||||
|
||||
def HasPkMinMax3Insts :
|
||||
Predicate<"Subtarget->hasPkMinMax3Insts()">,
|
||||
|
||||
@ -4835,6 +4835,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
|
||||
case Intrinsic::amdgcn_perm_pk16_b4_u4:
|
||||
case Intrinsic::amdgcn_perm_pk16_b6_u4:
|
||||
case Intrinsic::amdgcn_perm_pk16_b8_u4:
|
||||
case Intrinsic::amdgcn_add_max_i32:
|
||||
case Intrinsic::amdgcn_add_max_u32:
|
||||
case Intrinsic::amdgcn_add_min_i32:
|
||||
case Intrinsic::amdgcn_add_min_u32:
|
||||
case Intrinsic::amdgcn_pk_add_max_i16:
|
||||
case Intrinsic::amdgcn_pk_add_max_u16:
|
||||
case Intrinsic::amdgcn_pk_add_min_i16:
|
||||
case Intrinsic::amdgcn_pk_add_min_u16:
|
||||
return getDefaultMappingVOP(MI);
|
||||
case Intrinsic::amdgcn_log:
|
||||
case Intrinsic::amdgcn_exp2:
|
||||
|
||||
@ -277,6 +277,8 @@ protected:
|
||||
bool HasLshlAddU64Inst = false;
|
||||
bool HasAddSubU64Insts = false;
|
||||
bool HasMadU32Inst = false;
|
||||
bool HasAddMinMaxInsts = false;
|
||||
bool HasPkAddMinMaxInsts = false;
|
||||
bool HasPointSampleAccel = false;
|
||||
bool HasLdsBarrierArriveAtomic = false;
|
||||
bool HasSetPrioIncWgInst = false;
|
||||
@ -1567,10 +1569,10 @@ public:
|
||||
bool hasIntMinMax64() const { return GFX1250Insts; }
|
||||
|
||||
// \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions.
|
||||
bool hasAddMinMaxInsts() const { return GFX1250Insts; }
|
||||
bool hasAddMinMaxInsts() const { return HasAddMinMaxInsts; }
|
||||
|
||||
// \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
|
||||
bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
|
||||
bool hasPkAddMinMaxInsts() const { return HasPkAddMinMaxInsts; }
|
||||
|
||||
// \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
|
||||
bool hasPkMinMax3Insts() const { return GFX1250Insts; }
|
||||
|
||||
@ -775,10 +775,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
|
||||
} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
|
||||
|
||||
let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in {
|
||||
defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
|
||||
defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
|
||||
defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
|
||||
defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
|
||||
defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_i32>;
|
||||
defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_u32>;
|
||||
defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_i32>;
|
||||
defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_u32>;
|
||||
}
|
||||
|
||||
defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>;
|
||||
|
||||
@ -75,7 +75,7 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
|
||||
SDPatternOperator node = null_frag, bit IsDOT = 0> {
|
||||
def NAME : VOP3P_Pseudo<OpName, P,
|
||||
!if (P.HasModifiers,
|
||||
getVOP3PModPat<P, node, IsDOT, IsDOT>.ret,
|
||||
getVOP3PModPat<P, node, !or(P.EnableClamp, IsDOT), IsDOT>.ret,
|
||||
getVOP3Pat<P, node>.ret)>;
|
||||
let SubtargetPredicate = isGFX11Plus in {
|
||||
if P.HasExtVOP3DPP then
|
||||
@ -434,15 +434,16 @@ defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_BF16_t16>;
|
||||
} // End SubtargetPredicate = HasFmaMixBF16Insts
|
||||
|
||||
def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
|
||||
let HasModifiers = 0;
|
||||
let HasNeg = 0;
|
||||
let EnableClamp = 1;
|
||||
}
|
||||
|
||||
let isCommutable = 1, isReMaterializable = 1 in {
|
||||
let SubtargetPredicate = HasPkAddMinMaxInsts in {
|
||||
defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile>;
|
||||
defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile>;
|
||||
defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile>;
|
||||
defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile>;
|
||||
defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_i16>;
|
||||
defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_u16>;
|
||||
defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_i16>;
|
||||
defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_u16>;
|
||||
}
|
||||
let SubtargetPredicate = HasPkMinMax3Insts in {
|
||||
defm V_PK_MAX3_I16 : VOP3PInst<"v_pk_max3_i16", PK_ADD_MINMAX_Profile>;
|
||||
|
||||
@ -433,6 +433,8 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
|
||||
Features["fp8e5m3-insts"] = true;
|
||||
Features["permlane16-swap"] = true;
|
||||
Features["ashr-pk-insts"] = true;
|
||||
Features["add-min-max-insts"] = true;
|
||||
Features["pk-add-min-max-insts"] = true;
|
||||
Features["atomic-buffer-pk-add-bf16-inst"] = true;
|
||||
Features["vmem-pref-insts"] = true;
|
||||
Features["atomic-fadd-rtn-insts"] = true;
|
||||
|
||||
191
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll
Normal file
191
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll
Normal file
@ -0,0 +1,191 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250-SDAG %s
|
||||
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250-GISEL %s
|
||||
|
||||
declare i32 @llvm.amdgcn.add.min.i32(i32, i32, i32, i1)
|
||||
declare i32 @llvm.amdgcn.add.max.i32(i32, i32, i32, i1)
|
||||
declare i32 @llvm.amdgcn.add.min.u32(i32, i32, i32, i1)
|
||||
declare i32 @llvm.amdgcn.add.max.u32(i32, i32, i32, i1)
|
||||
declare <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16>, <2 x i16>, <2 x i16>, i1)
|
||||
declare <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16>, <2 x i16>, <2 x i16>, i1)
|
||||
declare <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16>, <2 x i16>, <2 x i16>, i1)
|
||||
declare <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16>, <2 x i16>, <2 x i16>, i1)
|
||||
|
||||
define i32 @test_add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
|
||||
; GCN-LABEL: test_add_min_i32_vvv:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GCN-NEXT: s_wait_kmcnt 0x0
|
||||
; GCN-NEXT: v_add_min_i32 v0, v0, v1, v2
|
||||
; GCN-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call i32 @llvm.amdgcn.add.min.i32(i32 %a, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @test_add_min_i32_ssi_clamp(i32 inreg %a, i32 inreg %b) {
|
||||
; GCN-LABEL: test_add_min_i32_ssi_clamp:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GCN-NEXT: s_wait_kmcnt 0x0
|
||||
; GCN-NEXT: v_add_min_i32 v0, s0, s1, 1 clamp
|
||||
; GCN-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call i32 @llvm.amdgcn.add.min.i32(i32 %a, i32 %b, i32 1, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @test_add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
|
||||
; GCN-LABEL: test_add_min_u32_vvv:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GCN-NEXT: s_wait_kmcnt 0x0
|
||||
; GCN-NEXT: v_add_min_u32 v0, v0, v1, v2
|
||||
; GCN-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call i32 @llvm.amdgcn.add.min.u32(i32 %a, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @test_add_min_u32_ssi_clamp(i32 inreg %a, i32 inreg %b) {
|
||||
; GCN-LABEL: test_add_min_u32_ssi_clamp:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GCN-NEXT: s_wait_kmcnt 0x0
|
||||
; GCN-NEXT: v_add_min_u32 v0, s0, s1, 1 clamp
|
||||
; GCN-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call i32 @llvm.amdgcn.add.min.u32(i32 %a, i32 %b, i32 1, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @test_add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
|
||||
; GCN-LABEL: test_add_max_i32_vvv:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GCN-NEXT: s_wait_kmcnt 0x0
|
||||
; GCN-NEXT: v_add_max_i32 v0, v0, v1, v2
|
||||
; GCN-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call i32 @llvm.amdgcn.add.max.i32(i32 %a, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @test_add_max_i32_ssi_clamp(i32 inreg %a, i32 inreg %b) {
|
||||
; GCN-LABEL: test_add_max_i32_ssi_clamp:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GCN-NEXT: s_wait_kmcnt 0x0
|
||||
; GCN-NEXT: v_add_max_i32 v0, s0, s1, 1 clamp
|
||||
; GCN-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call i32 @llvm.amdgcn.add.max.i32(i32 %a, i32 %b, i32 1, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @test_add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
|
||||
; GCN-LABEL: test_add_max_u32_vvv:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GCN-NEXT: s_wait_kmcnt 0x0
|
||||
; GCN-NEXT: v_add_max_u32 v0, v0, v1, v2
|
||||
; GCN-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call i32 @llvm.amdgcn.add.max.u32(i32 %a, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @test_add_max_u32_ssi_clamp(i32 inreg %a, i32 inreg %b) {
|
||||
; GCN-LABEL: test_add_max_u32_ssi_clamp:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GCN-NEXT: s_wait_kmcnt 0x0
|
||||
; GCN-NEXT: v_add_max_u32 v0, s0, s1, 1 clamp
|
||||
; GCN-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call i32 @llvm.amdgcn.add.max.u32(i32 %a, i32 %b, i32 1, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define <2 x i16> @test_add_min_i16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
|
||||
; GCN-LABEL: test_add_min_i16_vvv:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GCN-NEXT: s_wait_kmcnt 0x0
|
||||
; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2
|
||||
; GCN-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0)
|
||||
ret <2 x i16> %ret
|
||||
}
|
||||
|
||||
define <2 x i16> @test_add_min_i16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) {
|
||||
; GCN-LABEL: test_add_min_i16_ssi_clamp:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GCN-NEXT: s_wait_kmcnt 0x0
|
||||
; GCN-NEXT: v_pk_add_min_i16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp
|
||||
; GCN-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1)
|
||||
ret <2 x i16> %ret
|
||||
}
|
||||
|
||||
define <2 x i16> @test_add_min_u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
|
||||
; GCN-LABEL: test_add_min_u16_vvv:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GCN-NEXT: s_wait_kmcnt 0x0
|
||||
; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2
|
||||
; GCN-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0)
|
||||
ret <2 x i16> %ret
|
||||
}
|
||||
|
||||
define <2 x i16> @test_add_min_u16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) {
|
||||
; GCN-LABEL: test_add_min_u16_ssi_clamp:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GCN-NEXT: s_wait_kmcnt 0x0
|
||||
; GCN-NEXT: v_pk_add_min_u16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp
|
||||
; GCN-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1)
|
||||
ret <2 x i16> %ret
|
||||
}
|
||||
|
||||
define <2 x i16> @test_add_max_i16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
|
||||
; GCN-LABEL: test_add_max_i16_vvv:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GCN-NEXT: s_wait_kmcnt 0x0
|
||||
; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2
|
||||
; GCN-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0)
|
||||
ret <2 x i16> %ret
|
||||
}
|
||||
|
||||
define <2 x i16> @test_add_max_i16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) {
|
||||
; GCN-LABEL: test_add_max_i16_ssi_clamp:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GCN-NEXT: s_wait_kmcnt 0x0
|
||||
; GCN-NEXT: v_pk_add_max_i16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp
|
||||
; GCN-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1)
|
||||
ret <2 x i16> %ret
|
||||
}
|
||||
|
||||
define <2 x i16> @test_add_max_u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
|
||||
; GCN-LABEL: test_add_max_u16_vvv:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GCN-NEXT: s_wait_kmcnt 0x0
|
||||
; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2
|
||||
; GCN-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0)
|
||||
ret <2 x i16> %ret
|
||||
}
|
||||
|
||||
define <2 x i16> @test_add_max_u16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) {
|
||||
; GCN-LABEL: test_add_max_u16_ssi_clamp:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GCN-NEXT: s_wait_kmcnt 0x0
|
||||
; GCN-NEXT: v_pk_add_max_u16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp
|
||||
; GCN-NEXT: s_set_pc_i64 s[30:31]
|
||||
%ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1)
|
||||
ret <2 x i16> %ret
|
||||
}
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GFX1250-GISEL: {{.*}}
|
||||
; GFX1250-SDAG: {{.*}}
|
||||
Loading…
x
Reference in New Issue
Block a user