[AMDGPU] True16 support for bf16 clamp pattern on gfx1250 (#190036)

This commit is contained in:
Stanislav Mekhanoshin 2026-04-01 14:26:42 -07:00 committed by GitHub
parent c6669c4993
commit a9df7c7186
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 183 additions and 56 deletions

View File

@ -2044,9 +2044,17 @@ let SubtargetPredicate = UseRealTrue16Insts in
def : ClampPat<V_MAX_F16_t16_e64, f16>;
// f16 clamp when the subtarget uses fake16 instructions. ClampPat is defined
// outside this excerpt; presumably it selects a clamp of the given type to
// the named instruction -- confirm against its definition.
let SubtargetPredicate = UseFakeTrue16Insts in
def : ClampPat<V_MAX_F16_fake16_e64, f16>;
// bf16 clamp through the V_MAX_BF16 pseudo, enabled only under the
// UseFakeTrue16Insts True16 predicate.
// FIXME-TRUE16: Pseudo expansion of this won't work with True16.
let True16Predicate = UseFakeTrue16Insts in
def : ClampPat<V_MAX_BF16_PSEUDO_e64, bf16>;
// bf16 clamp for real True16, gated on HasBF16PackedInsts.
// Matches AMDGPUclamp of a bf16 VOP3Mods source and emits V_PK_MAX_NUM_BF16
// with DSTCLAMP.ENABLE, using the same value (and the same modifiers) for
// both source operands. Each operand is a VGPR_32 built by REG_SEQUENCE with
// the 16-bit source in lo16 and an IMPLICIT_DEF in hi16; the bf16 result is
// taken from the lo16 subregister of the packed instruction's output.
let True16Predicate = UseRealTrue16Insts,
SubtargetPredicate = HasBF16PackedInsts in
def : GCNPat<(bf16 (AMDGPUclamp (VOP3Mods bf16:$src0, i32:$src0_modifiers))),
(EXTRACT_SUBREG
(V_PK_MAX_NUM_BF16
$src0_modifiers, (REG_SEQUENCE VGPR_32, $src0, lo16, (bf16 (IMPLICIT_DEF)), hi16),
$src0_modifiers, (REG_SEQUENCE VGPR_32, $src0, lo16, (bf16 (IMPLICIT_DEF)), hi16),
DSTCLAMP.ENABLE), lo16)
>;
let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <

View File

@ -1,26 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GCN,TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GCN,FAKE16 %s
; TODO: Add global-isel when it can support bf16
; Takes the square root of a bfloat passed as a plain (non-inreg) argument via
; llvm.sqrt.bf16 and stores the result to %out. The assertions expect a single
; v_sqrt_bf16_e32 on v2 followed by a 16-bit global store; the real-true16
; variant spells the register operands as v2.l.
define amdgpu_ps void @llvm_sqrt_bf16_v(ptr addrspace(1) %out, bfloat %src) {
; GCN-LABEL: llvm_sqrt_bf16_v:
; GCN: ; %bb.0:
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: v_sqrt_bf16_e32 v2, v2
; GCN-NEXT: global_store_b16 v[0:1], v2, off
; GCN-NEXT: s_endpgm
; TRUE16-LABEL: llvm_sqrt_bf16_v:
; TRUE16: ; %bb.0:
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; TRUE16-NEXT: v_sqrt_bf16_e32 v2.l, v2.l
; TRUE16-NEXT: global_store_b16 v[0:1], v2, off
; TRUE16-NEXT: s_endpgm
;
; FAKE16-LABEL: llvm_sqrt_bf16_v:
; FAKE16: ; %bb.0:
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; FAKE16-NEXT: v_sqrt_bf16_e32 v2, v2
; FAKE16-NEXT: global_store_b16 v[0:1], v2, off
; FAKE16-NEXT: s_endpgm
%sqrt = call bfloat @llvm.sqrt.bf16(bfloat %src)
store bfloat %sqrt, ptr addrspace(1) %out, align 2
ret void
}
define amdgpu_ps void @llvm_sqrt_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) {
; GCN-LABEL: llvm_sqrt_bf16_s:
; GCN: ; %bb.0:
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: v_sqrt_bf16_e32 v2, s0
; GCN-NEXT: global_store_b16 v[0:1], v2, off
; GCN-NEXT: s_endpgm
; TRUE16-LABEL: llvm_sqrt_bf16_s:
; TRUE16: ; %bb.0:
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; TRUE16-NEXT: v_sqrt_bf16_e32 v2.l, s0
; TRUE16-NEXT: global_store_b16 v[0:1], v2, off
; TRUE16-NEXT: s_endpgm
;
; FAKE16-LABEL: llvm_sqrt_bf16_s:
; FAKE16: ; %bb.0:
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; FAKE16-NEXT: v_sqrt_bf16_e32 v2, s0
; FAKE16-NEXT: global_store_b16 v[0:1], v2, off
; FAKE16-NEXT: s_endpgm
%sqrt = call bfloat @llvm.sqrt.bf16(bfloat %src)
store bfloat %sqrt, ptr addrspace(1) %out, align 2
ret void
@ -353,27 +368,94 @@ define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat>
}
; Clamps a bfloat argument to [0.0, 1.0] with maxnum(%src, 0.0) followed by
; minnum(%max, 1.0) and returns it. The assertions expect the pair to become
; one v_pk_max_num_bf16 v0, v0, v0 with the clamp bit; the real-true16 variant
; additionally prints op_sel_hi:[0,0].
define amdgpu_ps bfloat @test_clamp_bf16(bfloat %src) {
; GCN-LABEL: test_clamp_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
; GCN-NEXT: ; return to shader part epilog
; TRUE16-LABEL: test_clamp_bf16:
; TRUE16: ; %bb.0:
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; TRUE16-NEXT: v_pk_max_num_bf16 v0, v0, v0 op_sel_hi:[0,0] clamp
; TRUE16-NEXT: ; return to shader part epilog
;
; FAKE16-LABEL: test_clamp_bf16:
; FAKE16: ; %bb.0:
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; FAKE16-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
; FAKE16-NEXT: ; return to shader part epilog
%max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
ret bfloat %clamp
}
; Same [0.0, 1.0] clamp as a maxnum/minnum pair, but the bfloat source is an
; inreg argument (s0 in the assertions). The fake16 variant feeds s0 straight
; into v_pk_max_num_bf16; the real-true16 variant first copies s0 into v0.l
; with v_mov_b16 and then clamps v0.
define amdgpu_ps bfloat @test_clamp_bf16_s(bfloat inreg %src) {
; GCN-LABEL: test_clamp_bf16_s:
; GCN: ; %bb.0:
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
; GCN-NEXT: ; return to shader part epilog
; TRUE16-LABEL: test_clamp_bf16_s:
; TRUE16: ; %bb.0:
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
; TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; TRUE16-NEXT: v_pk_max_num_bf16 v0, v0, v0 op_sel_hi:[0,0] clamp
; TRUE16-NEXT: ; return to shader part epilog
;
; FAKE16-LABEL: test_clamp_bf16_s:
; FAKE16: ; %bb.0:
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; FAKE16-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
; FAKE16-NEXT: ; return to shader part epilog
%max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
ret bfloat %clamp
}
; Extracts element 1 of a <2 x bfloat> argument, clamps it to [0.0, 1.0] with
; a maxnum/minnum pair, and returns the result extended to float. In the
; assertions the real-true16 variant moves v0.h into v0.l before the packed
; max and rebuilds the float from 16-bit halves, while the fake16 variant
; shifts right by 16 before the clamp and left by 16 afterwards.
define amdgpu_ps float @test_clamp_bf16_hi16(<2 x bfloat> %src) {
; TRUE16-LABEL: test_clamp_bf16_hi16:
; TRUE16: ; %bb.0:
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; TRUE16-NEXT: v_pk_max_num_bf16 v1, v0, v0 op_sel_hi:[0,0] clamp
; TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
; TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
; TRUE16-NEXT: ; return to shader part epilog
;
; FAKE16-LABEL: test_clamp_bf16_hi16:
; FAKE16: ; %bb.0:
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; FAKE16-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
; FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; FAKE16-NEXT: ; return to shader part epilog
%bsrc = extractelement <2 x bfloat> %src, i32 1
%max = call bfloat @llvm.maxnum.bf16(bfloat %bsrc, bfloat 0.0)
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
%res = fpext bfloat %clamp to float
ret float %res
}
; Inreg variant of the high-element clamp: extracts element 1 of an inreg
; <2 x bfloat>, clamps it to [0.0, 1.0] with a maxnum/minnum pair, and returns
; it extended to float. Both variants in the assertions start with
; s_lshr_b32 s0, s0, 16; the real-true16 one then copies s0 into v0.l before
; the packed max, whereas the fake16 one passes s0 directly.
define amdgpu_ps float @test_clamp_bf16_hi16_s(<2 x bfloat> inreg %src) {
; TRUE16-LABEL: test_clamp_bf16_hi16_s:
; TRUE16: ; %bb.0:
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; TRUE16-NEXT: s_lshr_b32 s0, s0, 16
; TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
; TRUE16-NEXT: v_pk_max_num_bf16 v0, v0, v0 op_sel_hi:[0,0] clamp
; TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; TRUE16-NEXT: ; return to shader part epilog
;
; FAKE16-LABEL: test_clamp_bf16_hi16_s:
; FAKE16: ; %bb.0:
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; FAKE16-NEXT: s_lshr_b32 s0, s0, 16
; FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; FAKE16-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
; FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; FAKE16-NEXT: ; return to shader part epilog
%bsrc = extractelement <2 x bfloat> %src, i32 1
%max = call bfloat @llvm.maxnum.bf16(bfloat %bsrc, bfloat 0.0)
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
%res = fpext bfloat %clamp to float
ret float %res
}
define amdgpu_ps float @test_clamp_v2bf16(<2 x bfloat> %src) {
; GCN-LABEL: test_clamp_v2bf16:
; GCN: ; %bb.0:
@ -399,14 +481,23 @@ define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) {
}
define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
; GCN-LABEL: test_clamp_bf16_folding:
; GCN: ; %bb.0:
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: v_exp_bf16_e32 v0, v0
; GCN-NEXT: v_nop
; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
; GCN-NEXT: ; return to shader part epilog
; TRUE16-LABEL: test_clamp_bf16_folding:
; TRUE16: ; %bb.0:
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; TRUE16-NEXT: v_exp_bf16_e32 v0.l, v0.l
; TRUE16-NEXT: v_nop
; TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; TRUE16-NEXT: v_pk_max_num_bf16 v0, v0, v0 op_sel_hi:[0,0] clamp
; TRUE16-NEXT: ; return to shader part epilog
;
; FAKE16-LABEL: test_clamp_bf16_folding:
; FAKE16: ; %bb.0:
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; FAKE16-NEXT: v_exp_bf16_e32 v0, v0
; FAKE16-NEXT: v_nop
; FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; FAKE16-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
; FAKE16-NEXT: ; return to shader part epilog
%exp = call bfloat @llvm.exp2.bf16(bfloat %src)
%max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0)
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
@ -560,48 +651,76 @@ define amdgpu_ps void @v_test_fma_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat>
}
; Computes llvm.log2.bf16 of a bfloat passed as a plain (non-inreg) argument
; and stores the result to %out. The assertions expect a single
; v_log_bf16_e32 on v2 followed by a 16-bit global store; the real-true16
; variant spells the register operands as v2.l.
define amdgpu_ps void @llvm_log2_bf16_v(ptr addrspace(1) %out, bfloat %src) {
; GCN-LABEL: llvm_log2_bf16_v:
; GCN: ; %bb.0:
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: v_log_bf16_e32 v2, v2
; GCN-NEXT: global_store_b16 v[0:1], v2, off
; GCN-NEXT: s_endpgm
; TRUE16-LABEL: llvm_log2_bf16_v:
; TRUE16: ; %bb.0:
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; TRUE16-NEXT: v_log_bf16_e32 v2.l, v2.l
; TRUE16-NEXT: global_store_b16 v[0:1], v2, off
; TRUE16-NEXT: s_endpgm
;
; FAKE16-LABEL: llvm_log2_bf16_v:
; FAKE16: ; %bb.0:
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; FAKE16-NEXT: v_log_bf16_e32 v2, v2
; FAKE16-NEXT: global_store_b16 v[0:1], v2, off
; FAKE16-NEXT: s_endpgm
%log = call bfloat @llvm.log2.bf16(bfloat %src)
store bfloat %log, ptr addrspace(1) %out, align 2
ret void
}
; Computes llvm.log2.bf16 of a bfloat passed as an inreg argument and stores
; the result to %out. The assertions expect v_log_bf16_e32 to read s0
; directly and write v2 (v2.l in the real-true16 variant), followed by a
; 16-bit global store.
define amdgpu_ps void @llvm_log2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) {
; GCN-LABEL: llvm_log2_bf16_s:
; GCN: ; %bb.0:
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: v_log_bf16_e32 v2, s0
; GCN-NEXT: global_store_b16 v[0:1], v2, off
; GCN-NEXT: s_endpgm
; TRUE16-LABEL: llvm_log2_bf16_s:
; TRUE16: ; %bb.0:
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; TRUE16-NEXT: v_log_bf16_e32 v2.l, s0
; TRUE16-NEXT: global_store_b16 v[0:1], v2, off
; TRUE16-NEXT: s_endpgm
;
; FAKE16-LABEL: llvm_log2_bf16_s:
; FAKE16: ; %bb.0:
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; FAKE16-NEXT: v_log_bf16_e32 v2, s0
; FAKE16-NEXT: global_store_b16 v[0:1], v2, off
; FAKE16-NEXT: s_endpgm
%log = call bfloat @llvm.log2.bf16(bfloat %src)
store bfloat %log, ptr addrspace(1) %out, align 2
ret void
}
; Computes llvm.exp2.bf16 of a bfloat passed as a plain (non-inreg) argument
; and stores the result to %out. The assertions expect a single
; v_exp_bf16_e32 on v2 followed by a 16-bit global store; the real-true16
; variant spells the register operands as v2.l.
define amdgpu_ps void @llvm_exp2_bf16_v(ptr addrspace(1) %out, bfloat %src) {
; GCN-LABEL: llvm_exp2_bf16_v:
; GCN: ; %bb.0:
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: v_exp_bf16_e32 v2, v2
; GCN-NEXT: global_store_b16 v[0:1], v2, off
; GCN-NEXT: s_endpgm
; TRUE16-LABEL: llvm_exp2_bf16_v:
; TRUE16: ; %bb.0:
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; TRUE16-NEXT: v_exp_bf16_e32 v2.l, v2.l
; TRUE16-NEXT: global_store_b16 v[0:1], v2, off
; TRUE16-NEXT: s_endpgm
;
; FAKE16-LABEL: llvm_exp2_bf16_v:
; FAKE16: ; %bb.0:
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; FAKE16-NEXT: v_exp_bf16_e32 v2, v2
; FAKE16-NEXT: global_store_b16 v[0:1], v2, off
; FAKE16-NEXT: s_endpgm
%exp = call bfloat @llvm.exp2.bf16(bfloat %src)
store bfloat %exp, ptr addrspace(1) %out, align 2
ret void
}
define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) {
; GCN-LABEL: llvm_exp2_bf16_s:
; GCN: ; %bb.0:
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: v_exp_bf16_e32 v2, s0
; GCN-NEXT: global_store_b16 v[0:1], v2, off
; GCN-NEXT: s_endpgm
; TRUE16-LABEL: llvm_exp2_bf16_s:
; TRUE16: ; %bb.0:
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; TRUE16-NEXT: v_exp_bf16_e32 v2.l, s0
; TRUE16-NEXT: global_store_b16 v[0:1], v2, off
; TRUE16-NEXT: s_endpgm
;
; FAKE16-LABEL: llvm_exp2_bf16_s:
; FAKE16: ; %bb.0:
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; FAKE16-NEXT: v_exp_bf16_e32 v2, s0
; FAKE16-NEXT: global_store_b16 v[0:1], v2, off
; FAKE16-NEXT: s_endpgm
%exp = call bfloat @llvm.exp2.bf16(bfloat %src)
store bfloat %exp, ptr addrspace(1) %out, align 2
ret void