
GFX950 uses OP_SEL[MSB:LSB] for both source reads and destination writes, so this patch essentially reverts the work from https://github.com/llvm/llvm-project/pull/151286 regarding destination writes.
176 lines
8.8 KiB
LLVM
176 lines
8.8 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950 %s
|
|
|
|
; Declarations of the three intrinsics called by the tests in this file. They
; differ only in the type of %src (<2 x half>, <2 x bfloat>, <2 x float>); the
; remaining operands are %old, %seed, %scale and %dst_sel, and each returns i32.
declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 %old, <2 x half> %src, i32 %seed, float %scale, i32 %dst_sel)
declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 %old, <2 x bfloat> %src, i32 %seed, float %scale, i32 %dst_sel)
declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 %old, <2 x float> %src, i32 %seed, float %scale, i32 %dst_sel)
|
|
|
|
; f16 source, dst_sel immediate 0: loads the dword at %out, passes it as %old to
; the intrinsic, and stores the result back to %out. The checked conversion
; instruction carries no op_sel modifier.
define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_0(ptr addrspace(1) %out, <2 x half> %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_scalef32_sr_pk_fp4_f16_dst_sel_0:
; GFX950:       ; %bb.0:
; GFX950-NEXT:    global_load_dword v5, v[0:1], off
; GFX950-NEXT:    s_waitcnt vmcnt(0)
; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4
; GFX950-NEXT:    global_store_dword v[0:1], v6, off
; GFX950-NEXT:    s_endpgm
  %old = load i32, ptr addrspace(1) %out, align 4
  %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 %old, <2 x half> %src, i32 %seed, float %scale, i32 0)
  store i32 %cvt, ptr addrspace(1) %out, align 4
  ret void
}
|
|
|
|
; f16 source, dst_sel immediate 1: loads the dword at %out, passes it as %old to
; the intrinsic, and stores the result back to %out. The checked conversion
; instruction carries op_sel:[0,0,1,0].
define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_1(ptr addrspace(1) %out, <2 x half> %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_scalef32_sr_pk_fp4_f16_dst_sel_1:
; GFX950:       ; %bb.0:
; GFX950-NEXT:    global_load_dword v5, v[0:1], off
; GFX950-NEXT:    s_waitcnt vmcnt(0)
; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT:    global_store_dword v[0:1], v6, off
; GFX950-NEXT:    s_endpgm
  %old = load i32, ptr addrspace(1) %out, align 4
  %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 %old, <2 x half> %src, i32 %seed, float %scale, i32 1)
  store i32 %cvt, ptr addrspace(1) %out, align 4
  ret void
}
|
|
|
|
; f16 source, dst_sel immediate 2: loads the dword at %out, passes it as %old to
; the intrinsic, and stores the result back to %out. The checked conversion
; instruction carries op_sel:[0,0,0,1].
define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_2(ptr addrspace(1) %out, <2 x half> %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_scalef32_sr_pk_fp4_f16_dst_sel_2:
; GFX950:       ; %bb.0:
; GFX950-NEXT:    global_load_dword v5, v[0:1], off
; GFX950-NEXT:    s_waitcnt vmcnt(0)
; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT:    global_store_dword v[0:1], v6, off
; GFX950-NEXT:    s_endpgm
  %old = load i32, ptr addrspace(1) %out, align 4
  %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 %old, <2 x half> %src, i32 %seed, float %scale, i32 2)
  store i32 %cvt, ptr addrspace(1) %out, align 4
  ret void
}
|
|
|
|
; f16 source, dst_sel immediate 3: loads the dword at %out, passes it as %old to
; the intrinsic, and stores the result back to %out. The checked conversion
; instruction carries op_sel:[0,0,1,1].
define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_3(ptr addrspace(1) %out, <2 x half> %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_scalef32_sr_pk_fp4_f16_dst_sel_3:
; GFX950:       ; %bb.0:
; GFX950-NEXT:    global_load_dword v5, v[0:1], off
; GFX950-NEXT:    s_waitcnt vmcnt(0)
; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,1,1]
; GFX950-NEXT:    global_store_dword v[0:1], v6, off
; GFX950-NEXT:    s_endpgm
  %old = load i32, ptr addrspace(1) %out, align 4
  %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 %old, <2 x half> %src, i32 %seed, float %scale, i32 3)
  store i32 %cvt, ptr addrspace(1) %out, align 4
  ret void
}
|
|
|
|
; bf16 source, dst_sel immediate 0: loads the dword at %out, passes it as %old
; to the intrinsic, and stores the result back to %out. The checked conversion
; instruction carries no op_sel modifier.
define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_0(ptr addrspace(1) %out, <2 x bfloat> %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_scalef32_sr_pk_fp4_bf16_dst_sel_0:
; GFX950:       ; %bb.0:
; GFX950-NEXT:    global_load_dword v5, v[0:1], off
; GFX950-NEXT:    s_waitcnt vmcnt(0)
; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4
; GFX950-NEXT:    global_store_dword v[0:1], v6, off
; GFX950-NEXT:    s_endpgm
  %old = load i32, ptr addrspace(1) %out, align 4
  %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 %old, <2 x bfloat> %src, i32 %seed, float %scale, i32 0)
  store i32 %cvt, ptr addrspace(1) %out, align 4
  ret void
}
|
|
|
|
; bf16 source, dst_sel immediate 1: loads the dword at %out, passes it as %old
; to the intrinsic, and stores the result back to %out. The checked conversion
; instruction carries op_sel:[0,0,1,0].
define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_1(ptr addrspace(1) %out, <2 x bfloat> %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_scalef32_sr_pk_fp4_bf16_dst_sel_1:
; GFX950:       ; %bb.0:
; GFX950-NEXT:    global_load_dword v5, v[0:1], off
; GFX950-NEXT:    s_waitcnt vmcnt(0)
; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT:    global_store_dword v[0:1], v6, off
; GFX950-NEXT:    s_endpgm
  %old = load i32, ptr addrspace(1) %out, align 4
  %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 %old, <2 x bfloat> %src, i32 %seed, float %scale, i32 1)
  store i32 %cvt, ptr addrspace(1) %out, align 4
  ret void
}
|
|
|
|
; bf16 source, dst_sel immediate 2: loads the dword at %out, passes it as %old
; to the intrinsic, and stores the result back to %out. The checked conversion
; instruction carries op_sel:[0,0,0,1].
define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_2(ptr addrspace(1) %out, <2 x bfloat> %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_scalef32_sr_pk_fp4_bf16_dst_sel_2:
; GFX950:       ; %bb.0:
; GFX950-NEXT:    global_load_dword v5, v[0:1], off
; GFX950-NEXT:    s_waitcnt vmcnt(0)
; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT:    global_store_dword v[0:1], v6, off
; GFX950-NEXT:    s_endpgm
  %old = load i32, ptr addrspace(1) %out, align 4
  %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 %old, <2 x bfloat> %src, i32 %seed, float %scale, i32 2)
  store i32 %cvt, ptr addrspace(1) %out, align 4
  ret void
}
|
|
|
|
; bf16 source, dst_sel immediate 3: loads the dword at %out, passes it as %old
; to the intrinsic, and stores the result back to %out. The checked conversion
; instruction carries op_sel:[0,0,1,1].
define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_3(ptr addrspace(1) %out, <2 x bfloat> %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_scalef32_sr_pk_fp4_bf16_dst_sel_3:
; GFX950:       ; %bb.0:
; GFX950-NEXT:    global_load_dword v5, v[0:1], off
; GFX950-NEXT:    s_waitcnt vmcnt(0)
; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,1,1]
; GFX950-NEXT:    global_store_dword v[0:1], v6, off
; GFX950-NEXT:    s_endpgm
  %old = load i32, ptr addrspace(1) %out, align 4
  %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 %old, <2 x bfloat> %src, i32 %seed, float %scale, i32 3)
  store i32 %cvt, ptr addrspace(1) %out, align 4
  ret void
}
|
|
|
|
; f32 source, dst_sel immediate 0: loads the dword at %out, passes it as %old to
; the intrinsic, and stores the result back to %out. The checked conversion
; instruction takes the source as the register pair v[2:3] and carries no
; op_sel modifier.
define amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_0(ptr addrspace(1) %out, <2 x float> %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_scalef32_sr_pk_fp4_f32_dst_sel_0:
; GFX950:       ; %bb.0:
; GFX950-NEXT:    global_load_dword v6, v[0:1], off
; GFX950-NEXT:    s_waitcnt vmcnt(0)
; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5
; GFX950-NEXT:    global_store_dword v[0:1], v7, off
; GFX950-NEXT:    s_endpgm
  %old = load i32, ptr addrspace(1) %out, align 4
  %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 %old, <2 x float> %src, i32 %seed, float %scale, i32 0)
  store i32 %cvt, ptr addrspace(1) %out, align 4
  ret void
}
|
|
|
|
; f32 source, dst_sel immediate 1: loads the dword at %out, passes it as %old to
; the intrinsic, and stores the result back to %out. The checked conversion
; instruction takes the source as the register pair v[2:3] and carries
; op_sel:[0,0,1,0].
define amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_1(ptr addrspace(1) %out, <2 x float> %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_scalef32_sr_pk_fp4_f32_dst_sel_1:
; GFX950:       ; %bb.0:
; GFX950-NEXT:    global_load_dword v6, v[0:1], off
; GFX950-NEXT:    s_waitcnt vmcnt(0)
; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,1,0]
; GFX950-NEXT:    global_store_dword v[0:1], v7, off
; GFX950-NEXT:    s_endpgm
  %old = load i32, ptr addrspace(1) %out, align 4
  %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 %old, <2 x float> %src, i32 %seed, float %scale, i32 1)
  store i32 %cvt, ptr addrspace(1) %out, align 4
  ret void
}
|
|
|
|
; f32 source, dst_sel immediate 2: loads the dword at %out, passes it as %old to
; the intrinsic, and stores the result back to %out. The checked conversion
; instruction takes the source as the register pair v[2:3] and carries
; op_sel:[0,0,0,1].
define amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_2(ptr addrspace(1) %out, <2 x float> %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_scalef32_sr_pk_fp4_f32_dst_sel_2:
; GFX950:       ; %bb.0:
; GFX950-NEXT:    global_load_dword v6, v[0:1], off
; GFX950-NEXT:    s_waitcnt vmcnt(0)
; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,0,1]
; GFX950-NEXT:    global_store_dword v[0:1], v7, off
; GFX950-NEXT:    s_endpgm
  %old = load i32, ptr addrspace(1) %out, align 4
  %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 %old, <2 x float> %src, i32 %seed, float %scale, i32 2)
  store i32 %cvt, ptr addrspace(1) %out, align 4
  ret void
}
|
|
|
|
; f32 source, dst_sel immediate 3: loads the dword at %out, passes it as %old to
; the intrinsic, and stores the result back to %out. The checked conversion
; instruction takes the source as the register pair v[2:3] and carries
; op_sel:[0,0,1,1].
define amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_3(ptr addrspace(1) %out, <2 x float> %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_scalef32_sr_pk_fp4_f32_dst_sel_3:
; GFX950:       ; %bb.0:
; GFX950-NEXT:    global_load_dword v6, v[0:1], off
; GFX950-NEXT:    s_waitcnt vmcnt(0)
; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,1,1]
; GFX950-NEXT:    global_store_dword v[0:1], v7, off
; GFX950-NEXT:    s_endpgm
  %old = load i32, ptr addrspace(1) %out, align 4
  %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 %old, <2 x float> %src, i32 %seed, float %scale, i32 3)
  store i32 %cvt, ptr addrspace(1) %out, align 4
  ret void
}
|