llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.ll
David Green 5a81a559d6
[GISel] Explicitly disable BF16 tablegen patterns. (#124113)
We currently have an issue where bf16 patterns can be used to match fp16
types, as GISel does not know about the difference between the two. This
patch explicitly disables them to make sure that they are never used.

The opposite can also happen, where fp16 patterns are used for
operators that should be bf16. So this also changes any operations with
bf16 types to now cause a fallback to SDAG.

The pass setup for GISel has been slightly adjusted to make sure that a
verify pass does not get added between AMD-SDAG and SIFixSGPRCopiesPass,
which otherwise can cause verifier issues when falling back.
2025-01-27 22:21:12 +00:00

127 lines
5.8 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -o - %s | FileCheck -check-prefix=GFX950 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -o - %s | FileCheck -check-prefix=GFX950 %s
; Intrinsics exercised by this file. Each takes a two-element 16-bit float
; vector, a float, an i32 and an i1, and returns the same vector type as its
; first operand.
; NOTE(review): the i1 presumably selects which 16-bit word of the result is
; written (the *_word_sel_1 tests pass true and expect op_sel:[0,0,1]) - confirm
; against the intrinsic definition.
declare <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half>, float, i32, i1)
declare <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat>, float, i32, i1)
; cvt.sr.bf16.f32 with the i1 operand set to false.
; Reads the <2 x bfloat> currently stored at %out, feeds it to the intrinsic as
; the first operand along with %src0 and %src1, and stores the result back to
; %out. The expected output is a v_cvt_sr_bf16_f32 without any op_sel modifier.
define amdgpu_ps void @test_cvt_sr_bf16_f32_word_sel_0(ptr addrspace(1) %out, float %src0, i32 %src1) {
; GFX950-LABEL: test_cvt_sr_bf16_f32_word_sel_0:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v4, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_sr_bf16_f32 v4, v2, v3
; GFX950-NEXT: global_store_dword v[0:1], v4, off
; GFX950-NEXT: s_endpgm
  %prev = load <2 x bfloat>, ptr addrspace(1) %out, align 4
  %packed = tail call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> %prev, float %src0, i32 %src1, i1 false)
  store <2 x bfloat> %packed, ptr addrspace(1) %out, align 8
  ret void
}
; cvt.sr.bf16.f32 with the i1 operand set to true.
; Same load / convert / store shape as the i1-false variant; the only expected
; difference in the output is the op_sel:[0,0,1] modifier on the convert.
define amdgpu_ps void @test_cvt_sr_bf16_f32_word_sel_1(ptr addrspace(1) %out, float %src0, i32 %src1) {
; GFX950-LABEL: test_cvt_sr_bf16_f32_word_sel_1:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v4, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_sr_bf16_f32 v4, v2, v3 op_sel:[0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v4, off
; GFX950-NEXT: s_endpgm
  %prev = load <2 x bfloat>, ptr addrspace(1) %out, align 4
  %packed = tail call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> %prev, float %src0, i32 %src1, i1 true)
  store <2 x bfloat> %packed, ptr addrspace(1) %out, align 8
  ret void
}
; cvt.sr.bf16.f32 whose float source is wrapped in llvm.fabs.f32.
; The checks expect the absolute value to be computed by a separate v_and_b32
; with 0x7fffffff before the convert, i.e. the convert itself carries no source
; modifier in the expected output.
define amdgpu_ps void @test_cvt_sr_bf16_f32_fabs(ptr addrspace(1) %out, float %src0, i32 %src1) {
; GFX950-LABEL: test_cvt_sr_bf16_f32_fabs:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v4, v[0:1], off
; GFX950-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_sr_bf16_f32 v4, v2, v3
; GFX950-NEXT: global_store_dword v[0:1], v4, off
; GFX950-NEXT: s_endpgm
  %prev = load <2 x bfloat>, ptr addrspace(1) %out, align 4
  %abs.src0 = call float @llvm.fabs.f32(float %src0)
  %packed = tail call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> %prev, float %abs.src0, i32 %src1, i1 false)
  store <2 x bfloat> %packed, ptr addrspace(1) %out, align 8
  ret void
}
; cvt.sr.bf16.f32 whose float source is negated with fneg.
; The checks expect the negation to be done by a separate v_xor_b32 with
; 0x80000000 before the convert, i.e. the convert itself carries no source
; modifier in the expected output.
define amdgpu_ps void @test_cvt_sr_bf16_f32_fneg(ptr addrspace(1) %out, float %src0, i32 %src1) {
; GFX950-LABEL: test_cvt_sr_bf16_f32_fneg:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v4, v[0:1], off
; GFX950-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_sr_bf16_f32 v4, v2, v3
; GFX950-NEXT: global_store_dword v[0:1], v4, off
; GFX950-NEXT: s_endpgm
  %prev = load <2 x bfloat>, ptr addrspace(1) %out, align 4
  %neg.src0 = fneg float %src0
  %packed = tail call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> %prev, float %neg.src0, i32 %src1, i1 false)
  store <2 x bfloat> %packed, ptr addrspace(1) %out, align 8
  ret void
}
; cvt.sr.f16.f32 with the i1 operand set to false.
; Reads the <2 x half> currently stored at %out, feeds it to the intrinsic as
; the first operand along with %src0 and %src1, and stores the result back to
; %out. The expected output is a v_cvt_sr_f16_f32 without any op_sel modifier.
define amdgpu_ps void @test_cvt_sr_f16_f32_word_sel_0(ptr addrspace(1) %out, float %src0, i32 %src1) {
; GFX950-LABEL: test_cvt_sr_f16_f32_word_sel_0:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v4, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_sr_f16_f32 v4, v2, v3
; GFX950-NEXT: global_store_dword v[0:1], v4, off
; GFX950-NEXT: s_endpgm
  %prev = load <2 x half>, ptr addrspace(1) %out, align 4
  %packed = tail call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> %prev, float %src0, i32 %src1, i1 false)
  store <2 x half> %packed, ptr addrspace(1) %out, align 8
  ret void
}
; cvt.sr.f16.f32 with the i1 operand set to true.
; Same load / convert / store shape as the i1-false variant; the only expected
; difference in the output is the op_sel:[0,0,1] modifier on the convert.
define amdgpu_ps void @test_cvt_sr_f16_f32_word_sel_1(ptr addrspace(1) %out, float %src0, i32 %src1) {
; GFX950-LABEL: test_cvt_sr_f16_f32_word_sel_1:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v4, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_sr_f16_f32 v4, v2, v3 op_sel:[0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v4, off
; GFX950-NEXT: s_endpgm
  %prev = load <2 x half>, ptr addrspace(1) %out, align 4
  %packed = tail call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> %prev, float %src0, i32 %src1, i1 true)
  store <2 x half> %packed, ptr addrspace(1) %out, align 8
  ret void
}
; cvt.sr.f16.f32 whose float source is wrapped in llvm.fabs.f32.
; The checks expect the absolute value to be computed by a separate v_and_b32
; with 0x7fffffff before the convert, i.e. the convert itself carries no source
; modifier in the expected output.
define amdgpu_ps void @test_cvt_sr_f16_f32_fabs(ptr addrspace(1) %out, float %src0, i32 %src1) {
; GFX950-LABEL: test_cvt_sr_f16_f32_fabs:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v4, v[0:1], off
; GFX950-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_sr_f16_f32 v4, v2, v3
; GFX950-NEXT: global_store_dword v[0:1], v4, off
; GFX950-NEXT: s_endpgm
  %prev = load <2 x half>, ptr addrspace(1) %out, align 4
  %abs.src0 = call float @llvm.fabs.f32(float %src0)
  %packed = tail call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> %prev, float %abs.src0, i32 %src1, i1 false)
  store <2 x half> %packed, ptr addrspace(1) %out, align 8
  ret void
}
; cvt.sr.f16.f32 whose float source is negated with fneg.
; The checks expect the negation to be done by a separate v_xor_b32 with
; 0x80000000 before the convert, i.e. the convert itself carries no source
; modifier in the expected output.
define amdgpu_ps void @test_cvt_sr_f16_f32_fneg(ptr addrspace(1) %out, float %src0, i32 %src1) {
; GFX950-LABEL: test_cvt_sr_f16_f32_fneg:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v4, v[0:1], off
; GFX950-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_sr_f16_f32 v4, v2, v3
; GFX950-NEXT: global_store_dword v[0:1], v4, off
; GFX950-NEXT: s_endpgm
  %prev = load <2 x half>, ptr addrspace(1) %out, align 4
  %neg.src0 = fneg float %src0
  %packed = tail call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> %prev, float %neg.src0, i32 %src1, i1 false)
  store <2 x half> %packed, ptr addrspace(1) %out, align 8
  ret void
}