
We currently have an issue where bf16 patterns can be used to match fp16 types, as GISel does not know about the difference between the two. This patch explicitly disables them to make sure that they are never used. The opposite can also happen, where fp16 patterns are used for operators that should be bf16, so this also changes any operations with bf16 types to now cause a fallback to SDAG. The pass setup for GISel has been slightly adjusted to make sure that a verify pass does not get added between AMD-SDAG and SIFixSGPRCopiesPass, which otherwise can cause verifier issues when falling back.
127 lines
5.8 KiB
LLVM
127 lines
5.8 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; The module is compiled twice for gfx950: once with SelectionDAG and once with
; GlobalISel enabled. The second invocation passes -global-isel-abort=2, so a
; function GlobalISel cannot handle falls back to SelectionDAG instead of
; aborting. Both invocations are verified against the same check prefix, i.e.
; the two selectors are expected to produce identical code.
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -o - %s | FileCheck -check-prefix=GFX950 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -o - %s | FileCheck -check-prefix=GFX950 %s

; Intrinsics under test. Each takes the previous <2 x half>/<2 x bfloat> value,
; the f32 source, an i32 operand and an i1 flag.
declare <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half>, float, i32, i1)
declare <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat>, float, i32, i1)

; Used by the *_fabs tests; declared explicitly for consistency with the
; intrinsic declarations above.
declare float @llvm.fabs.f32(float)

; bf16 conversion with the trailing i1 operand set to false (presumably the
; word select, going by the test name). The <2 x bfloat> loaded from %out is
; passed as the intrinsic's first operand and the result is stored back to %out.
; The checks expect the load and v_cvt_sr_bf16_f32 to share v4 and expect no
; op_sel modifier on the conversion.
define amdgpu_ps void @test_cvt_sr_bf16_f32_word_sel_0(ptr addrspace(1) %out, float %src0, i32 %src1) {
; GFX950-LABEL: test_cvt_sr_bf16_f32_word_sel_0:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v4, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_sr_bf16_f32 v4, v2, v3
; GFX950-NEXT: global_store_dword v[0:1], v4, off
; GFX950-NEXT: s_endpgm
  %old = load <2 x bfloat>, ptr addrspace(1) %out, align 4
  %cvt = tail call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> %old, float %src0, i32 %src1, i1 false)
  store <2 x bfloat> %cvt, ptr addrspace(1) %out, align 8
  ret void
}

; bf16 conversion with the trailing i1 operand set to true (presumably the word
; select, going by the test name). Same load/convert/store shape as the
; word_sel_0 variant; the only expected difference in the output is the
; op_sel:[0,0,1] modifier on v_cvt_sr_bf16_f32.
define amdgpu_ps void @test_cvt_sr_bf16_f32_word_sel_1(ptr addrspace(1) %out, float %src0, i32 %src1) {
; GFX950-LABEL: test_cvt_sr_bf16_f32_word_sel_1:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v4, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_sr_bf16_f32 v4, v2, v3 op_sel:[0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v4, off
; GFX950-NEXT: s_endpgm
  %old = load <2 x bfloat>, ptr addrspace(1) %out, align 4
  %cvt = tail call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> %old, float %src0, i32 %src1, i1 true)
  store <2 x bfloat> %cvt, ptr addrspace(1) %out, align 8
  ret void
}

; bf16 conversion whose f32 source is first passed through llvm.fabs.f32.
; The checks expect the absolute value to be computed by a separate v_and_b32
; with mask 0x7fffffff ahead of the conversion, not folded into
; v_cvt_sr_bf16_f32 as a source modifier.
define amdgpu_ps void @test_cvt_sr_bf16_f32_fabs(ptr addrspace(1) %out, float %src0, i32 %src1) {
; GFX950-LABEL: test_cvt_sr_bf16_f32_fabs:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v4, v[0:1], off
; GFX950-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_sr_bf16_f32 v4, v2, v3
; GFX950-NEXT: global_store_dword v[0:1], v4, off
; GFX950-NEXT: s_endpgm
  %old = load <2 x bfloat>, ptr addrspace(1) %out, align 4
  %src0.fabs = call float @llvm.fabs.f32(float %src0)
  %cvt = tail call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> %old, float %src0.fabs, i32 %src1, i1 false)
  store <2 x bfloat> %cvt, ptr addrspace(1) %out, align 8
  ret void
}

; bf16 conversion whose f32 source is negated with fneg first. The checks
; expect the negation to be a separate v_xor_b32 with 0x80000000 ahead of the
; conversion, not folded into v_cvt_sr_bf16_f32 as a source modifier.
define amdgpu_ps void @test_cvt_sr_bf16_f32_fneg(ptr addrspace(1) %out, float %src0, i32 %src1) {
; GFX950-LABEL: test_cvt_sr_bf16_f32_fneg:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v4, v[0:1], off
; GFX950-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_sr_bf16_f32 v4, v2, v3
; GFX950-NEXT: global_store_dword v[0:1], v4, off
; GFX950-NEXT: s_endpgm
  %old = load <2 x bfloat>, ptr addrspace(1) %out, align 4
  %src0.fneg = fneg float %src0
  %cvt = tail call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> %old, float %src0.fneg, i32 %src1, i1 false)
  store <2 x bfloat> %cvt, ptr addrspace(1) %out, align 8
  ret void
}

; f16 conversion with the trailing i1 operand set to false (presumably the
; word select, going by the test name). The <2 x half> loaded from %out is
; passed as the intrinsic's first operand and the result is stored back to %out.
; The checks expect the load and v_cvt_sr_f16_f32 to share v4 and expect no
; op_sel modifier on the conversion.
define amdgpu_ps void @test_cvt_sr_f16_f32_word_sel_0(ptr addrspace(1) %out, float %src0, i32 %src1) {
; GFX950-LABEL: test_cvt_sr_f16_f32_word_sel_0:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v4, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_sr_f16_f32 v4, v2, v3
; GFX950-NEXT: global_store_dword v[0:1], v4, off
; GFX950-NEXT: s_endpgm
  %old = load <2 x half>, ptr addrspace(1) %out, align 4
  %cvt = tail call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> %old, float %src0, i32 %src1, i1 false)
  store <2 x half> %cvt, ptr addrspace(1) %out, align 8
  ret void
}

; f16 conversion with the trailing i1 operand set to true (presumably the word
; select, going by the test name). Same load/convert/store shape as the
; word_sel_0 variant; the only expected difference in the output is the
; op_sel:[0,0,1] modifier on v_cvt_sr_f16_f32.
define amdgpu_ps void @test_cvt_sr_f16_f32_word_sel_1(ptr addrspace(1) %out, float %src0, i32 %src1) {
; GFX950-LABEL: test_cvt_sr_f16_f32_word_sel_1:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v4, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_sr_f16_f32 v4, v2, v3 op_sel:[0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v4, off
; GFX950-NEXT: s_endpgm
  %old = load <2 x half>, ptr addrspace(1) %out, align 4
  %cvt = tail call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> %old, float %src0, i32 %src1, i1 true)
  store <2 x half> %cvt, ptr addrspace(1) %out, align 8
  ret void
}

; f16 conversion whose f32 source is first passed through llvm.fabs.f32.
; The checks expect the absolute value to be computed by a separate v_and_b32
; with mask 0x7fffffff ahead of the conversion, not folded into
; v_cvt_sr_f16_f32 as a source modifier.
define amdgpu_ps void @test_cvt_sr_f16_f32_fabs(ptr addrspace(1) %out, float %src0, i32 %src1) {
; GFX950-LABEL: test_cvt_sr_f16_f32_fabs:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v4, v[0:1], off
; GFX950-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_sr_f16_f32 v4, v2, v3
; GFX950-NEXT: global_store_dword v[0:1], v4, off
; GFX950-NEXT: s_endpgm
  %old = load <2 x half>, ptr addrspace(1) %out, align 4
  %src0.fabs = call float @llvm.fabs.f32(float %src0)
  %cvt = tail call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> %old, float %src0.fabs, i32 %src1, i1 false)
  store <2 x half> %cvt, ptr addrspace(1) %out, align 8
  ret void
}

; f16 conversion whose f32 source is negated with fneg first. The checks
; expect the negation to be a separate v_xor_b32 with 0x80000000 ahead of the
; conversion, not folded into v_cvt_sr_f16_f32 as a source modifier.
define amdgpu_ps void @test_cvt_sr_f16_f32_fneg(ptr addrspace(1) %out, float %src0, i32 %src1) {
; GFX950-LABEL: test_cvt_sr_f16_f32_fneg:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v4, v[0:1], off
; GFX950-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_sr_f16_f32 v4, v2, v3
; GFX950-NEXT: global_store_dword v[0:1], v4, off
; GFX950-NEXT: s_endpgm
  %old = load <2 x half>, ptr addrspace(1) %out, align 4
  %src0.fneg = fneg float %src0
  %cvt = tail call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> %old, float %src0.fneg, i32 %src1, i1 false)
  store <2 x half> %cvt, ptr addrspace(1) %out, align 8
  ret void
}