; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX90A-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX90A-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX942-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX942-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-GISEL %s
|
|
; fadd of a loaded <2 x float> with itself (vector + same vector).
; Each work-item loads element workitem.id.x of %a, adds the value to itself,
; and stores the result back to the same address.
; Expected codegen per the checks below: gfx900 emits two scalar v_add_f32,
; while gfx90a/gfx942 and gfx1250 emit a single v_pk_add_f32.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
; GFX900-LABEL: fadd_v2_vv:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v1, v1, v1
; GFX900-NEXT: v_add_f32_e32 v0, v0, v0
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-LABEL: fadd_v2_vv:
; PACKED: ; %bb.0:
; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-NEXT: s_waitcnt vmcnt(0)
; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
;
; GFX1250-LABEL: fadd_v2_vv:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
; GFX1250-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %add = fadd <2 x float> %load, %load
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}
|
|
|
|
; fadd of a loaded <2 x float> with a <2 x float> kernel argument
; (vector operand + scalar-register operand).
; Each work-item loads element workitem.id.x of %a, adds %x, and stores the
; result back to the same address.
; Expected codegen per the checks below: gfx900 emits two v_add_f32 reading
; s2/s3; gfx90a/gfx942 emit one v_pk_add_f32 that reads s[2:3] directly;
; gfx1250 first copies s[2:3] into v[2:3] with v_mov_b64 and then emits one
; v_pk_add_f32 on VGPR operands.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
; GFX900-LABEL: fadd_v2_vs:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v1, s3, v1
; GFX900-NEXT: v_add_f32_e32 v0, s2, v0
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-LABEL: fadd_v2_vs:
; PACKED: ; %bb.0:
; PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-NEXT: s_waitcnt vmcnt(0)
; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
;
; GFX1250-LABEL: fadd_v2_vs:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %add = fadd <2 x float> %load, %x
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}
|
|
|
|
; fadd of a loaded <4 x float> with a <4 x float> kernel argument.
; Each work-item loads element workitem.id.x of %a, adds %x, and stores the
; result back to the same address.
; Expected codegen per the checks below: gfx900 emits four v_add_f32; the
; packed targets split the operation into two v_pk_add_f32 (one per 64-bit
; half). SelectionDAG and GlobalISel have separate check blocks because they
; emit the two halves in opposite order and, on gfx1250, copy the scalar
; operand into VGPRs differently (v_dual_mov_b32 + v_mov_b64 vs. two
; v_mov_b64).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; GFX900-LABEL: fadd_v4_vs:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v3, s3, v3
; GFX900-NEXT: v_add_f32_e32 v2, s2, v2
; GFX900-NEXT: v_add_f32_e32 v1, s1, v1
; GFX900-NEXT: v_add_f32_e32 v0, s0, v0
; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fadd_v4_vs:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3]
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1]
; PACKED-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fadd_v4_vs:
; PACKED-GISEL: ; %bb.0:
; PACKED-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1]
; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fadd_v4_vs:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX1250-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[6:7]
; GFX1250-SDAG-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v4_vs:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX1250-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[6:7]
; GFX1250-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <4 x float>, ptr addrspace(1) %gep, align 16
  %add = fadd <4 x float> %load, %x
  store <4 x float> %add, ptr addrspace(1) %gep, align 16
  ret void
}
|
|
|
|
; fadd of a loaded <32 x float> with a <32 x float> kernel argument.
; Each work-item loads element workitem.id.x of %a (128 bytes, fetched as
; eight 128-bit global loads), adds %x, and stores the result back.
; Expected codegen per the checks below: gfx900 emits one v_add_f32 per
; element; the packed targets emit one v_pk_add_f32 per pair of elements.
; gfx90a/gfx942 read the %x operand straight from SGPR pairs, while gfx1250
; first copies it into VGPRs (v_dual_mov_b32 / v_mov_b64).
; SelectionDAG and GlobalISel have separate check blocks because they order
; the loads, adds, and stores differently.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900-LABEL: fadd_v32_vs:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v4, s43, v4
; GFX900-NEXT: v_add_f32_e32 v3, s42, v3
; GFX900-NEXT: v_add_f32_e32 v2, s41, v2
; GFX900-NEXT: v_add_f32_e32 v1, s40, v1
; GFX900-NEXT: s_waitcnt vmcnt(6)
; GFX900-NEXT: v_add_f32_e32 v8, s39, v8
; GFX900-NEXT: v_add_f32_e32 v7, s38, v7
; GFX900-NEXT: v_add_f32_e32 v6, s37, v6
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v32, s19, v32
; GFX900-NEXT: v_add_f32_e32 v31, s18, v31
; GFX900-NEXT: v_add_f32_e32 v30, s17, v30
; GFX900-NEXT: v_add_f32_e32 v29, s16, v29
; GFX900-NEXT: v_add_f32_e32 v5, s36, v5
; GFX900-NEXT: v_add_f32_e32 v12, s51, v12
; GFX900-NEXT: v_add_f32_e32 v11, s50, v11
; GFX900-NEXT: v_add_f32_e32 v10, s49, v10
; GFX900-NEXT: v_add_f32_e32 v9, s48, v9
; GFX900-NEXT: v_add_f32_e32 v16, s47, v16
; GFX900-NEXT: v_add_f32_e32 v15, s46, v15
; GFX900-NEXT: v_add_f32_e32 v14, s45, v14
; GFX900-NEXT: v_add_f32_e32 v13, s44, v13
; GFX900-NEXT: v_add_f32_e32 v20, s15, v20
; GFX900-NEXT: v_add_f32_e32 v19, s14, v19
; GFX900-NEXT: v_add_f32_e32 v18, s13, v18
; GFX900-NEXT: v_add_f32_e32 v17, s12, v17
; GFX900-NEXT: v_add_f32_e32 v24, s11, v24
; GFX900-NEXT: v_add_f32_e32 v23, s10, v23
; GFX900-NEXT: v_add_f32_e32 v22, s9, v22
; GFX900-NEXT: v_add_f32_e32 v21, s8, v21
; GFX900-NEXT: v_add_f32_e32 v28, s23, v28
; GFX900-NEXT: v_add_f32_e32 v27, s22, v27
; GFX900-NEXT: v_add_f32_e32 v26, s21, v26
; GFX900-NEXT: v_add_f32_e32 v25, s20, v25
; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96
; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112
; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64
; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32
; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48
; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1]
; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fadd_v32_vs:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
; PACKED-SDAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
; PACKED-SDAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
; PACKED-SDAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[40:41]
; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[42:43]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[38:39]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(5)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[48:49]
; PACKED-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[50:51]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(4)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[44:45]
; PACKED-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[46:47]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[16:17]
; PACKED-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[18:19]
; PACKED-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[12:13]
; PACKED-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[14:15]
; PACKED-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[10:11]
; PACKED-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[20:21]
; PACKED-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[22:23]
; PACKED-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[36:37]
; PACKED-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[8:9]
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1]
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fadd_v32_vs:
; PACKED-GISEL: ; %bb.0:
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; PACKED-GISEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
; PACKED-GISEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; PACKED-GISEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[36:37]
; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[38:39]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[40:41]
; PACKED-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[42:43]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(5)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[44:45]
; PACKED-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[46:47]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(4)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[48:49]
; PACKED-GISEL-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[50:51]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(3)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[8:9]
; PACKED-GISEL-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[10:11]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(2)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[12:13]
; PACKED-GISEL-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[14:15]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(1)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[16:17]
; PACKED-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[18:19]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[20:21]
; PACKED-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[22:23]
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fadd_v32_vs:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35]
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[40:41]
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[38:39]
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[34:35]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[52:53]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[54:55]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[50:51]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[40:41]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[46:47]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[48:49]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[42:43]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[44:45]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[36:37]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[38:39]
; GFX1250-SDAG-NEXT: s_clause 0x7
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v32_vs:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_clause 0x7
; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35]
; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16
; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48
; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[36:37]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[38:39]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[40:41]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[42:43]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[44:45]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[46:47]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[48:49]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[50:51]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[52:53]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[54:55]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[32:33]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[34:35]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[36:37]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[38:39]
; GFX1250-GISEL-NEXT: s_clause 0x7
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35]
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <32 x float>, ptr addrspace(1) %gep, align 128
  %add = fadd <32 x float> %load, %x
  store <32 x float> %add, ptr addrspace(1) %gep, align 128
  ret void
}
|
|
|
|
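; FIXME: GlobalISel does not use op_sel/op_sel_hi for splat constants; it
; copies the value into both lanes of a 64-bit register pair instead (compare
; the SDAG and GISEL check lines of the splat tests that follow).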
|
|
; fadd of a loaded <2 x float> with the splat constant <100.0, 100.0>
; (0x42c80000 in the expected assembly).
; Each work-item loads element workitem.id.x of %a, adds the constant, and
; stores the result back to the same address.
; Expected codegen per the checks below: gfx900 uses the literal directly in
; two v_add_f32. On the packed targets SelectionDAG materializes the constant
; once (s_mov_b32 s2) and broadcasts it with op_sel_hi:[1,0]; GlobalISel
; instead duplicates it into s3 and emits v_pk_add_f32 without op_sel_hi
; (on gfx1250 additionally copying the pair into VGPRs first).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
; GFX900-LABEL: fadd_v2_v_imm:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v1, 0x42c80000, v1
; GFX900-NEXT: v_add_f32_e32 v0, 0x42c80000, v0
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fadd_v2_v_imm:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-SDAG-NEXT: s_mov_b32 s2, 0x42c80000
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0]
; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fadd_v2_v_imm:
; PACKED-GISEL: ; %bb.0:
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
; PACKED-GISEL-NEXT: s_mov_b32 s3, s2
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fadd_v2_v_imm:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0x42c80000
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0]
; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v2_v_imm:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %add = fadd <2 x float> %load, <float 100.0, float 100.0>
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}
|
|
|
|
; fadd of a loaded <2 x float> with a splat of a per-lane (VGPR) value.
; The splat element is the work-item id bitcast to float, inserted into both
; lanes of a <2 x float>. Each work-item loads element workitem.id.x of %a,
; adds the splat, and stores the result back to the same address.
; Expected codegen per the checks below: gfx900 emits two v_add_f32 that both
; read v0. On the packed targets SelectionDAG emits v_pk_add_f32 with
; op_sel_hi:[1,0] on the splat operand; GlobalISel instead copies v0 into v1
; and emits v_pk_add_f32 without op_sel_hi.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
; GFX900-LABEL: fadd_v2_v_v_splat:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v2, v2, v0
; GFX900-NEXT: v_add_f32_e32 v1, v1, v0
; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fadd_v2_v_v_splat:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1] op_sel_hi:[1,0]
; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fadd_v2_v_v_splat:
; PACKED-GISEL: ; %bb.0:
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; PACKED-GISEL-NEXT: v_mov_b32_e32 v1, v0
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fadd_v2_v_v_splat:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[0:1] op_sel_hi:[1,0]
; GFX1250-SDAG-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v2_v_v_splat:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[0:1]
; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fid = bitcast i32 %id to float
  %tmp1 = insertelement <2 x float> poison, float %fid, i64 0
  %k = insertelement <2 x float> %tmp1, float %fid, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}
|
|
|
|
; fadd of a loaded <2 x float> with the constant splat <1.0, 1.0>.
; Expected code, per the assertions below: gfx900 emits one v_add_f32 per lane
; with the inline constant 1.0. The SelectionDAG runs on the packed targets use
; the inline constant 1.0 directly in v_pk_add_f32 with op_sel_hi:[1,0]. The
; GlobalISel runs build the pair in s[2:3] with two s_mov_b32 (gfx1250 then
; copies it to VGPRs with v_mov_b64) before an unmodified v_pk_add_f32.
define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
; GFX900-LABEL: fadd_v2_v_lit_splat:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fadd_v2_v_lit_splat:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0 op_sel_hi:[1,0]
; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fadd_v2_v_lit_splat:
; PACKED-GISEL: ; %bb.0:
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-GISEL-NEXT: s_mov_b32 s2, 1.0
; PACKED-GISEL-NEXT: s_mov_b32 s3, s2
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fadd_v2_v_lit_splat:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0 op_sel_hi:[1,0]
; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v2_v_lit_splat:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1.0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %add = fadd <2 x float> %load, <float 1.0, float 1.0>
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}
|
|
|
|
; fadd of a loaded <2 x float> with the constant <1.0, 0.0> (element 1 is 0).
; Expected code, per the assertions below: gfx900 adds 1.0 to v0 and 0 to v1.
; gfx90a/gfx942 (both selectors) materialize the constant with
; s_mov_b64 s[2:3], 0x3f800000. The gfx1250 SelectionDAG run passes the operand
; 1.0 to v_pk_add_f32 with no op_sel_hi modifier, and the gfx1250 GlobalISel run
; goes through s_mov_b64 followed by v_mov_b64.
define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
; GFX900-LABEL: fadd_v2_v_lit_hi0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v1, 0, v1
; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-LABEL: fadd_v2_v_lit_hi0:
; PACKED: ; %bb.0:
; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-NEXT: s_mov_b64 s[2:3], 0x3f800000
; PACKED-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-NEXT: s_waitcnt vmcnt(0)
; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fadd_v2_v_lit_hi0:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0
; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v2_v_lit_hi0:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], 0x3f800000
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %add = fadd <2 x float> %load, <float 1.0, float 0.0>
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}
|
|
|
|
; fadd of a loaded <2 x float> with the constant <0.0, 1.0> (element 0 is 0).
; Expected code, per the assertions below: gfx900 adds 0 to v0 and 1.0 to v1.
; gfx90a/gfx942 (both selectors) build the constant with s_mov_b32 s2, 0 and
; s_mov_b32 s3, 1.0. gfx1250 materializes lit64(0x3f80000000000000), directly
; into VGPRs with v_mov_b64 for SelectionDAG and via s_mov_b64 + v_mov_b64 for
; GlobalISel. No configuration applies an operand modifier to v_pk_add_f32.
define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) {
; GFX900-LABEL: fadd_v2_v_lit_lo0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX900-NEXT: v_add_f32_e32 v0, 0, v0
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-LABEL: fadd_v2_v_lit_lo0:
; PACKED: ; %bb.0:
; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-NEXT: s_mov_b32 s2, 0
; PACKED-NEXT: s_mov_b32 s3, 1.0
; PACKED-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-NEXT: s_waitcnt vmcnt(0)
; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fadd_v2_v_lit_lo0:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x3f80000000000000)
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v2_v_lit_lo0:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x3f80000000000000)
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %add = fadd <2 x float> %load, <float 0.0, float 1.0>
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}
|
|
|
|
; fadd of a loaded <2 x float> with the constant <1.0, 2.0>, whose two lanes
; differ.
; Expected code, per the assertions below: gfx900 adds 1.0 and 2.0 with separate
; v_add_f32. Every packed configuration first places the constant in a register
; pair (two s_mov_b32 on gfx90a/gfx942, lit64(0x400000003f800000) on gfx1250)
; and then issues v_pk_add_f32 with no operand modifiers.
define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; GFX900-LABEL: fadd_v2_v_unfoldable_lit:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v1, 2.0, v1
; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-LABEL: fadd_v2_v_unfoldable_lit:
; PACKED: ; %bb.0:
; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-NEXT: s_mov_b32 s2, 1.0
; PACKED-NEXT: s_mov_b32 s3, 2.0
; PACKED-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-NEXT: s_waitcnt vmcnt(0)
; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fadd_v2_v_unfoldable_lit:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x400000003f800000)
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v2_v_unfoldable_lit:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x400000003f800000)
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %add = fadd <2 x float> %load, <float 1.0, float 2.0>
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}
|
|
|
|
; FIXME: Fold fneg into v_pk_add_f32 with Global ISel.
; fadd of a loaded <2 x float> with a splat of the negated kernel argument %x,
; where the negation is written as `fsub float -0.0, %x`.
; Expected code, per the assertions below: gfx900 emits two v_subrev_f32. The
; SelectionDAG runs fold the negation into v_pk_add_f32 with neg_lo:[0,1]
; neg_hi:[0,1] (plus op_sel_hi:[1,0] for the splat). The GlobalISel runs
; instead compute the negated value with v_max_f32 / v_max_num_f32 on negated
; sources, copy it to a second VGPR, and use an unmodified v_pk_add_f32.
define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) {
; GFX900-LABEL: fadd_v2_v_fneg:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_subrev_f32_e32 v1, s2, v1
; GFX900-NEXT: v_subrev_f32_e32 v0, s2, v0
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fadd_v2_v_fneg:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fadd_v2_v_fneg:
; PACKED-GISEL: ; %bb.0:
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2
; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, v2
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fadd_v2_v_fneg:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v2_v_fneg:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -s2, -s2
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v2
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> poison, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}
|
|
|
|
; fadd of a loaded <2 x float> with the vector <-%x, %x>: only element 0 is
; negated (via `fsub float -0.0, %x`), and both elements come from the same
; kernel argument.
; Expected code, per the assertions below: gfx900 emits v_subrev_f32 for v0 and
; v_add_f32 for v1. The SelectionDAG runs use v_pk_add_f32 with op_sel_hi:[1,0]
; neg_lo:[0,1]. The GlobalISel runs build the pair in v[2:3] with v_max_f32 /
; v_max_num_f32 (negated sources) for element 0 and v_mov_b32 for element 1.
define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) {
; GFX900-LABEL: fadd_v2_v_fneg_lo:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v1, s2, v1
; GFX900-NEXT: v_subrev_f32_e32 v0, s2, v0
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fadd_v2_v_fneg_lo:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1]
; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fadd_v2_v_fneg_lo:
; PACKED-GISEL: ; %bb.0:
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, s2
; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_lo:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1]
; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_lo:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, s2
; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -s2, -s2
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> poison, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %x, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}
|
|
|
|
; fadd of a loaded <2 x float> with the vector <%x, -%x>: only element 1 is
; negated (via `fsub float -0.0, %x`), and both elements come from the same
; kernel argument.
; Expected code, per the assertions below: gfx900 emits v_add_f32 for v0 and
; v_subrev_f32 for v1. The SelectionDAG runs use v_pk_add_f32 with
; op_sel_hi:[1,0] neg_hi:[0,1]. The GlobalISel runs build the pair in v[2:3]
; with v_mov_b32 for element 0 and v_max_f32 / v_max_num_f32 (negated sources)
; for element 1.
define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) {
; GFX900-LABEL: fadd_v2_v_fneg_hi:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_subrev_f32_e32 v1, s2, v1
; GFX900-NEXT: v_add_f32_e32 v0, s2, v0
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fadd_v2_v_fneg_hi:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_hi:[0,1]
; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fadd_v2_v_fneg_hi:
; PACKED-GISEL: ; %bb.0:
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s2
; PACKED-GISEL-NEXT: v_max_f32_e64 v3, -s2, -s2
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_hi:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_hi:[0,1]
; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_hi:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v3, -s2, -s2
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> poison, float %x, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}
|
|
|
|
; fadd of a loaded <2 x float> with the vector <-%x, %y>: element 0 is the
; negated first float argument (via `fsub float -0.0, %x`), element 1 is the
; second float argument unchanged.
; Expected code, per the assertions below: gfx900 emits v_subrev_f32 with s2 and
; v_add_f32 with s3. The SelectionDAG runs use v_pk_add_f32 with neg_lo:[0,1]
; only, on s[2:3] for gfx90a/gfx942 and on a v_mov_b64 copy for gfx1250. The
; GlobalISel runs compute element 0 with v_max_f32 / v_max_num_f32 (negated
; sources) and copy s3 into element 1.
define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, float %y) {
; GFX900-LABEL: fadd_v2_v_fneg_lo2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v1, s3, v1
; GFX900-NEXT: v_subrev_f32_e32 v0, s2, v0
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fadd_v2_v_fneg_lo2:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] neg_lo:[0,1]
; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fadd_v2_v_fneg_lo2:
; PACKED-GISEL: ; %bb.0:
; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2
; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, s3
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_lo2:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] neg_lo:[0,1]
; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_lo2:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -s2, -s2
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, s3
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> poison, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %y, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}
|
|
|
|
; fadd of a loaded <2 x float> with the vector <%y, -%x>: element 0 is the
; second float argument, element 1 is the negated first float argument (via
; `fsub float -0.0, %x`), so the argument pair is used in swapped order.
; Expected code, per the assertions below: gfx900 emits v_add_f32 with s3 for v0
; and v_subrev_f32 with s2 for v1. The SelectionDAG runs use v_pk_add_f32 with
; op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1], on s[2:3] for gfx90a/gfx942 and on
; a v_mov_b64 copy for gfx1250. The GlobalISel runs build v[2:3] explicitly:
; v_mov_b32 from s3 for element 0, v_max_f32 / v_max_num_f32 with negated
; sources for element 1.
define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, float %y) {
; GFX900-LABEL: fadd_v2_v_fneg_hi2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_subrev_f32_e32 v1, s2, v1
; GFX900-NEXT: v_add_f32_e32 v0, s3, v0
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fadd_v2_v_fneg_hi2:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]
; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fadd_v2_v_fneg_hi2:
; PACKED-GISEL: ; %bb.0:
; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; PACKED-GISEL-NEXT: v_max_f32_e64 v3, -s2, -s2
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s3
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_hi2:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]
; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_hi2:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v3, -s2, -s2
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> poison, float %y, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}
|
|
|
|
; fmul of a loaded <2 x float> with itself (both operands are the same value).
; Expected code, per the assertions below: gfx900 emits one v_mul_f32 per lane,
; while gfx90a/gfx942 and gfx1250 emit a single v_pk_mul_f32 whose two sources
; are the same register pair. SelectionDAG and GlobalISel share one set of
; assertions on every target here.
define amdgpu_kernel void @fmul_v2_vv(ptr addrspace(1) %a) {
; GFX900-LABEL: fmul_v2_vv:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mul_f32_e32 v1, v1, v1
; GFX900-NEXT: v_mul_f32_e32 v0, v0, v0
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-LABEL: fmul_v2_vv:
; PACKED: ; %bb.0:
; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-NEXT: s_waitcnt vmcnt(0)
; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[0:1]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
;
; GFX1250-LABEL: fmul_v2_vv:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[0:1]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
; GFX1250-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %mul = fmul <2 x float> %load, %load
  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
  ret void
}
|
|
|
|
; Vector * kernel argument: loads the <2 x float> at %a[workitem.id.x],
; multiplies it by the kernel argument %x and stores the result back.
; The gfx90a/gfx942 checks feed the SGPR pair s[2:3] directly to v_pk_mul_f32;
; the gfx1250 checks first copy it to VGPRs with v_mov_b64.
define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
; GFX900-LABEL: fmul_v2_vs:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mul_f32_e32 v1, s3, v1
; GFX900-NEXT: v_mul_f32_e32 v0, s2, v0
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-LABEL: fmul_v2_vs:
; PACKED: ; %bb.0:
; PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-NEXT: s_waitcnt vmcnt(0)
; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
;
; GFX1250-LABEL: fmul_v2_vs:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %mul = fmul <2 x float> %load, %x
  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
  ret void
}
|
|
|
|
; Same shape as the two-element vector * kernel-argument case, widened to
; <4 x float>: loads the vector at %a[workitem.id.x], multiplies it by %x and
; stores it back. The packed-target checks expect two v_pk_mul_f32, one per
; 64-bit half; SelectionDAG and GlobalISel emit them in opposite order.
define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; GFX900-LABEL: fmul_v4_vs:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mul_f32_e32 v3, s3, v3
; GFX900-NEXT: v_mul_f32_e32 v2, s2, v2
; GFX900-NEXT: v_mul_f32_e32 v1, s1, v1
; GFX900-NEXT: v_mul_f32_e32 v0, s0, v0
; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fmul_v4_vs:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3]
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1]
; PACKED-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fmul_v4_vs:
; PACKED-GISEL: ; %bb.0:
; PACKED-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1]
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fmul_v4_vs:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX1250-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[4:5]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[6:7]
; GFX1250-SDAG-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fmul_v4_vs:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX1250-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[6:7]
; GFX1250-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <4 x float>, ptr addrspace(1) %gep, align 16
  %mul = fmul <4 x float> %load, %x
  store <4 x float> %mul, ptr addrspace(1) %gep, align 16
  ret void
}
|
|
|
|
; Wide vector * kernel argument: loads the <32 x float> at %a[workitem.id.x],
; multiplies it by the kernel argument %x and stores the result back.
; The checks split the access into eight 128-bit loads and stores; the packed
; targets multiply with sixteen v_pk_mul_f32, gfx900 with 32 scalar v_mul_f32.
define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900-LABEL: fmul_v32_vs:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4
; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3
; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2
; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1
; GFX900-NEXT: s_waitcnt vmcnt(6)
; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8
; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7
; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32
; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31
; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30
; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29
; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5
; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12
; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11
; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10
; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9
; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16
; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15
; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14
; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13
; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20
; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19
; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18
; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17
; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24
; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23
; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22
; GFX900-NEXT: v_mul_f32_e32 v21, s8, v21
; GFX900-NEXT: v_mul_f32_e32 v28, s23, v28
; GFX900-NEXT: v_mul_f32_e32 v27, s22, v27
; GFX900-NEXT: v_mul_f32_e32 v26, s21, v26
; GFX900-NEXT: v_mul_f32_e32 v25, s20, v25
; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96
; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112
; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64
; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32
; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48
; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1]
; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fmul_v32_vs:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
; PACKED-SDAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
; PACKED-SDAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
; PACKED-SDAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[40:41]
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[42:43]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[38:39]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(5)
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[48:49]
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[50:51]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(4)
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[44:45]
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[46:47]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[16:17]
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[18:19]
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[12:13]
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[14:15]
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[10:11]
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[20:21]
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[22:23]
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[36:37]
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[8:9]
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1]
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fmul_v32_vs:
; PACKED-GISEL: ; %bb.0:
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; PACKED-GISEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
; PACKED-GISEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; PACKED-GISEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[36:37]
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[38:39]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[40:41]
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[42:43]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(5)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[44:45]
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[46:47]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(4)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[48:49]
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[50:51]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(3)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[8:9]
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[10:11]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(2)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[12:13]
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[14:15]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(1)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[16:17]
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[18:19]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[20:21]
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[22:23]
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fmul_v32_vs:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35]
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[40:41]
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[38:39]
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[34:35]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[52:53]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[54:55]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[50:51]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[40:41]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[46:47]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[48:49]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[42:43]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[44:45]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[36:37]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[38:39]
; GFX1250-SDAG-NEXT: s_clause 0x7
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fmul_v32_vs:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_clause 0x7
; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35]
; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16
; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48
; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[36:37]
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[38:39]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[40:41]
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[42:43]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[44:45]
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[46:47]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[48:49]
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[50:51]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[52:53]
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[54:55]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[32:33]
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[34:35]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[36:37]
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[38:39]
; GFX1250-GISEL-NEXT: s_clause 0x7
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35]
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <32 x float>, ptr addrspace(1) %gep, align 128
  %mul = fmul <32 x float> %load, %x
  store <32 x float> %mul, ptr addrspace(1) %gep, align 128
  ret void
}
|
|
|
|
; Vector * constant splat: loads the <2 x float> at %a[workitem.id.x],
; multiplies both lanes by 100.0 (0x42c80000 in the checks) and stores back.
; The SelectionDAG checks materialize the constant once and reuse the low half
; through op_sel_hi:[1,0]; the GlobalISel checks copy it into both halves of
; the register pair.
define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) {
; GFX900-LABEL: fmul_v2_v_imm:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mul_f32_e32 v1, 0x42c80000, v1
; GFX900-NEXT: v_mul_f32_e32 v0, 0x42c80000, v0
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fmul_v2_v_imm:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-SDAG-NEXT: s_mov_b32 s2, 0x42c80000
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0]
; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fmul_v2_v_imm:
; PACKED-GISEL: ; %bb.0:
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
; PACKED-GISEL-NEXT: s_mov_b32 s3, s2
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fmul_v2_v_imm:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0x42c80000
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0]
; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fmul_v2_v_imm:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %mul = fmul <2 x float> %load, <float 100.0, float 100.0>
  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
  ret void
}
|
|
|
|
; Vector * VGPR splat: the workitem id is bitcast to float and inserted into
; both lanes of a <2 x float>, which then multiplies the vector loaded from
; %a[workitem.id.x]; the product is stored back.
; The SelectionDAG checks broadcast the low half with op_sel_hi:[1,0]; the
; GlobalISel checks copy v0 into v1 first.
define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) {
; GFX900-LABEL: fmul_v2_v_v_splat:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mul_f32_e32 v2, v2, v0
; GFX900-NEXT: v_mul_f32_e32 v1, v1, v0
; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fmul_v2_v_v_splat:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[2:3], v[0:1] op_sel_hi:[1,0]
; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fmul_v2_v_v_splat:
; PACKED-GISEL: ; %bb.0:
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; PACKED-GISEL-NEXT: v_mov_b32_e32 v1, v0
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[2:3], v[0:1]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fmul_v2_v_v_splat:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[0:1] op_sel_hi:[1,0]
; GFX1250-SDAG-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fmul_v2_v_v_splat:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[0:1]
; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fid = bitcast i32 %id to float
  %tmp1 = insertelement <2 x float> poison, float %fid, i64 0
  %k = insertelement <2 x float> %tmp1, float %fid, i64 1
  %mul = fmul <2 x float> %load, %k
  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
  ret void
}
|
|
|
|
; fmul of a loaded <2 x float> by the splat literal <float 4.0, float 4.0>.
; Each work-item loads the vector at %a indexed by workitem.id.x, multiplies it
; by the constant, and stores the product back to the same element.
; What the recorded output below shows:
;  - gfx900 run: two v_mul_f32_e32, each taking 4.0 as an inline operand.
;  - SelectionDAG runs (gfx90a/gfx942 and gfx1250): 4.0 is folded directly into
;    v_pk_mul_f32 with op_sel_hi:[1,0].
;  - GlobalISel runs: the splat is first built in s[2:3] with two s_mov_b32; on
;    gfx1250 it is additionally copied to v[2:3] before the multiply.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
|
|
; GFX900-LABEL: fmul_v2_v_lit_splat:
|
|
; GFX900: ; %bb.0:
|
|
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX900-NEXT: v_mul_f32_e32 v1, 4.0, v1
|
|
; GFX900-NEXT: v_mul_f32_e32 v0, 4.0, v0
|
|
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX900-NEXT: s_endpgm
|
|
;
|
|
; PACKED-SDAG-LABEL: fmul_v2_v_lit_splat:
|
|
; PACKED-SDAG: ; %bb.0:
|
|
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], 4.0 op_sel_hi:[1,0]
|
|
; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; PACKED-SDAG-NEXT: s_endpgm
|
|
;
|
|
; PACKED-GISEL-LABEL: fmul_v2_v_lit_splat:
|
|
; PACKED-GISEL: ; %bb.0:
|
|
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; PACKED-GISEL-NEXT: s_mov_b32 s2, 4.0
|
|
; PACKED-GISEL-NEXT: s_mov_b32 s3, s2
|
|
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
|
|
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; PACKED-GISEL-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-SDAG-LABEL: fmul_v2_v_lit_splat:
|
|
; GFX1250-SDAG: ; %bb.0:
|
|
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
|
|
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
|
|
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], 4.0 op_sel_hi:[1,0]
|
|
; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
|
|
; GFX1250-SDAG-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-GISEL-LABEL: fmul_v2_v_lit_splat:
|
|
; GFX1250-GISEL: ; %bb.0:
|
|
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
|
|
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 4.0
|
|
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
|
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
|
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
|
|
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
|
|
; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
|
|
; GFX1250-GISEL-NEXT: s_endpgm
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
|
|
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
|
|
%mul = fmul <2 x float> %load, <float 4.0, float 4.0>
|
|
store <2 x float> %mul, ptr addrspace(1) %gep, align 8
|
|
ret void
|
|
}
|
|
|
|
; fmul of a loaded <2 x float> by the non-splat literal <float 4.0, float 3.0>.
; Each work-item loads the vector at %a indexed by workitem.id.x, multiplies it
; by the constant, and stores the product back to the same element.
; Because the two lanes differ, none of the recorded outputs fold the constant
; into the packed multiply as an immediate:
;  - gfx900 run: two v_mul_f32_e32; lane 0 uses inline 4.0, lane 1 uses the
;    32-bit literal 0x40400000 (the IEEE-754 single encoding of 3.0).
;  - gfx90a/gfx942 runs (both selectors share one check prefix): the pair is
;    built in s[2:3] with two s_mov_b32 and passed to v_pk_mul_f32.
;  - gfx1250 runs: the pair is a single 64-bit literal,
;    lit64(0x4040000040800000); SelectionDAG moves it straight into v[2:3],
;    GlobalISel goes through s[2:3] first.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
|
|
; GFX900-LABEL: fmul_v2_v_unfoldable_lit:
|
|
; GFX900: ; %bb.0:
|
|
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX900-NEXT: v_mul_f32_e32 v1, 0x40400000, v1
|
|
; GFX900-NEXT: v_mul_f32_e32 v0, 4.0, v0
|
|
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX900-NEXT: s_endpgm
|
|
;
|
|
; PACKED-LABEL: fmul_v2_v_unfoldable_lit:
|
|
; PACKED: ; %bb.0:
|
|
; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; PACKED-NEXT: s_mov_b32 s2, 4.0
|
|
; PACKED-NEXT: s_mov_b32 s3, 0x40400000
|
|
; PACKED-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; PACKED-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
|
|
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; PACKED-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-SDAG-LABEL: fmul_v2_v_unfoldable_lit:
|
|
; GFX1250-SDAG: ; %bb.0:
|
|
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4040000040800000)
|
|
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
|
|
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
|
|
; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
|
|
; GFX1250-SDAG-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-GISEL-LABEL: fmul_v2_v_unfoldable_lit:
|
|
; GFX1250-GISEL: ; %bb.0:
|
|
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
|
|
; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x4040000040800000)
|
|
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
|
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
|
|
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
|
|
; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
|
|
; GFX1250-GISEL-NEXT: s_endpgm
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
|
|
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
|
|
%mul = fmul <2 x float> %load, <float 4.0, float 3.0>
|
|
store <2 x float> %mul, ptr addrspace(1) %gep, align 8
|
|
ret void
|
|
}
|
|
|
|
; fmul of a loaded <2 x float> by a splat of the negated scalar kernel argument %x.
; The negation is spelled `fsub float -0.0, %x` (not the `fneg` instruction) and
; the result is inserted into both lanes of %k before the multiply; the product
; is stored back to the element of %a indexed by workitem.id.x.
; What the recorded output below shows:
;  - gfx900 run: two v_mul_f32_e64 with the negation applied as a -s2 source
;    modifier.
;  - SelectionDAG runs: a single v_pk_mul_f32 on s[2:3] with op_sel_hi:[1,0]
;    and neg_lo:[0,1] neg_hi:[0,1], i.e. the negation is folded into modifiers.
;  - GlobalISel runs: the negated value is first produced in v2 by
;    v_max_f32_e64 (v_max_num_f32_e64 on gfx1250) of -s2 with itself, copied to
;    v3, and then fed to an unmodified v_pk_mul_f32.
; NOTE(review): the GlobalISel sequence presumably stems from the fsub form not
; being matched as a foldable fneg there - confirm before rewriting the IR.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) {
|
|
; GFX900-LABEL: fmul_v2_v_fneg:
|
|
; GFX900: ; %bb.0:
|
|
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c
|
|
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX900-NEXT: v_mul_f32_e64 v1, v1, -s2
|
|
; GFX900-NEXT: v_mul_f32_e64 v0, v0, -s2
|
|
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX900-NEXT: s_endpgm
|
|
;
|
|
; PACKED-SDAG-LABEL: fmul_v2_v_fneg:
|
|
; PACKED-SDAG: ; %bb.0:
|
|
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
|
|
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
|
|
; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; PACKED-SDAG-NEXT: s_endpgm
|
|
;
|
|
; PACKED-GISEL-LABEL: fmul_v2_v_fneg:
|
|
; PACKED-GISEL: ; %bb.0:
|
|
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
|
|
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
|
|
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
|
|
; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2
|
|
; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, v2
|
|
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
|
|
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
|
|
; PACKED-GISEL-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-SDAG-LABEL: fmul_v2_v_fneg:
|
|
; GFX1250-SDAG: ; %bb.0:
|
|
; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
|
|
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
|
|
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
|
|
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
|
|
; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
|
|
; GFX1250-SDAG-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-GISEL-LABEL: fmul_v2_v_fneg:
|
|
; GFX1250-GISEL: ; %bb.0:
|
|
; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
|
|
; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
|
|
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
|
|
; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -s2, -s2
|
|
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
|
|
; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
|
|
; GFX1250-GISEL-NEXT: s_endpgm
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
|
|
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
|
|
%fneg = fsub float -0.0, %x
|
|
%tmp1 = insertelement <2 x float> poison, float %fneg, i64 0
|
|
%k = insertelement <2 x float> %tmp1, float %fneg, i64 1
|
|
%mul = fmul <2 x float> %load, %k
|
|
store <2 x float> %mul, ptr addrspace(1) %gep, align 8
|
|
ret void
|
|
}
|
|
|
|
; llvm.fma.v2f32 where all three operands are the same loaded <2 x float>.
; Each work-item loads the vector at %a indexed by workitem.id.x, computes
; fma(load, load, load), and stores the result back to the same element.
; What the recorded output below shows:
;  - gfx900 run: two scalar v_fma_f32, one per lane.
;  - gfx90a/gfx942 and gfx1250 runs: a single v_pk_fma_f32 with v[0:1] used for
;    the destination and all three sources; SelectionDAG and GlobalISel share
;    one check prefix per target family here.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
define amdgpu_kernel void @fma_v2_vv(ptr addrspace(1) %a) {
|
|
; GFX900-LABEL: fma_v2_vv:
|
|
; GFX900: ; %bb.0:
|
|
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX900-NEXT: v_fma_f32 v1, v1, v1, v1
|
|
; GFX900-NEXT: v_fma_f32 v0, v0, v0, v0
|
|
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX900-NEXT: s_endpgm
|
|
;
|
|
; PACKED-LABEL: fma_v2_vv:
|
|
; PACKED: ; %bb.0:
|
|
; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; PACKED-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; PACKED-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[0:1], v[0:1]
|
|
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; PACKED-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: fma_v2_vv:
|
|
; GFX1250: ; %bb.0:
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[0:1], v[0:1]
|
|
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
|
|
; GFX1250-NEXT: s_endpgm
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
|
|
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
|
|
%fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %load, <2 x float> %load)
|
|
store <2 x float> %fma, ptr addrspace(1) %gep, align 8
|
|
ret void
|
|
}
|
|
|
|
; llvm.fma.v2f32 with a loaded vector and a <2 x float> kernel argument:
; fma(load, %x, %x). %x is used as both the multiplicand and the addend.
; Each work-item loads the vector at %a indexed by workitem.id.x and stores the
; result back to the same element.
; What the recorded output below shows:
;  - gfx900 run: two v_fma_f32, each reading the same SGPR (s3 / s2) for its
;    second and third source.
;  - gfx90a/gfx942 runs: one v_pk_fma_f32 that reads s[2:3] directly for both
;    the second and third source.
;  - gfx1250 runs: s[2:3] is first copied to v[2:3] with v_mov_b64_e32 and the
;    v_pk_fma_f32 then uses only VGPR sources.
; SelectionDAG and GlobalISel share one check prefix per target family here.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
|
|
; GFX900-LABEL: fma_v2_vs:
|
|
; GFX900: ; %bb.0:
|
|
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX900-NEXT: v_fma_f32 v1, v1, s3, s3
|
|
; GFX900-NEXT: v_fma_f32 v0, v0, s2, s2
|
|
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX900-NEXT: s_endpgm
|
|
;
|
|
; PACKED-LABEL: fma_v2_vs:
|
|
; PACKED: ; %bb.0:
|
|
; PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; PACKED-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; PACKED-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3]
|
|
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; PACKED-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: fma_v2_vs:
|
|
; GFX1250: ; %bb.0:
|
|
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3]
|
|
; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
|
|
; GFX1250-NEXT: s_endpgm
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
|
|
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
|
|
%fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %x, <2 x float> %x)
|
|
store <2 x float> %fma, ptr addrspace(1) %gep, align 8
|
|
ret void
|
|
}
|
|
|
|
; llvm.fma.v4f32 with a loaded vector and a <4 x float> kernel argument:
; fma(load, %x, %x). This is the 4-wide counterpart of the <2 x float> case and
; exercises splitting the operation into two packed halves.
; Each work-item loads the 16-byte-aligned vector at %a indexed by
; workitem.id.x and stores the result back to the same element.
; What the recorded output below shows:
;  - gfx900 run: four scalar v_fma_f32, one per lane, reading s0..s3.
;  - gfx90a/gfx942 runs: two v_pk_fma_f32 reading the SGPR pairs s[0:1] and
;    s[2:3] directly. SelectionDAG emits the v[2:3] half first, GlobalISel the
;    v[0:1] half first, hence the separate check prefixes.
;  - gfx1250 runs: both SGPR pairs are copied into VGPR pairs with
;    v_mov_b64_e32 before the two v_pk_fma_f32; the two selectors again differ
;    in the order of the halves.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
|
|
; GFX900-LABEL: fma_v4_vs:
|
|
; GFX900: ; %bb.0:
|
|
; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
|
; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
|
|
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX900-NEXT: v_fma_f32 v3, v3, s3, s3
|
|
; GFX900-NEXT: v_fma_f32 v2, v2, s2, s2
|
|
; GFX900-NEXT: v_fma_f32 v1, v1, s1, s1
|
|
; GFX900-NEXT: v_fma_f32 v0, v0, s0, s0
|
|
; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
|
; GFX900-NEXT: s_endpgm
|
|
;
|
|
; PACKED-SDAG-LABEL: fma_v4_vs:
|
|
; PACKED-SDAG: ; %bb.0:
|
|
; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
|
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0
|
|
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
|
|
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3]
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1]
|
|
; PACKED-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
|
; PACKED-SDAG-NEXT: s_endpgm
|
|
;
|
|
; PACKED-GISEL-LABEL: fma_v4_vs:
|
|
; PACKED-GISEL: ; %bb.0:
|
|
; PACKED-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
|
; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
|
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
|
|
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
|
|
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1]
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3]
|
|
; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
|
; PACKED-GISEL-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-SDAG-LABEL: fma_v4_vs:
|
|
; GFX1250-SDAG: ; %bb.0:
|
|
; GFX1250-SDAG-NEXT: s_clause 0x1
|
|
; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
|
|
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
|
|
; GFX1250-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
|
|
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
|
|
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[4:5], v[4:5]
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[6:7], v[6:7]
|
|
; GFX1250-SDAG-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
|
|
; GFX1250-SDAG-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-GISEL-LABEL: fma_v4_vs:
|
|
; GFX1250-GISEL: ; %bb.0:
|
|
; GFX1250-GISEL-NEXT: s_clause 0x1
|
|
; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
|
|
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
|
|
; GFX1250-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
|
|
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
|
|
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[4:5], v[4:5]
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[6:7], v[6:7]
|
|
; GFX1250-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
|
|
; GFX1250-GISEL-NEXT: s_endpgm
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
|
|
%load = load <4 x float>, ptr addrspace(1) %gep, align 16
|
|
%fma = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %load, <4 x float> %x, <4 x float> %x)
|
|
store <4 x float> %fma, ptr addrspace(1) %gep, align 16
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
|
|
; GFX900-LABEL: fma_v32_vs:
|
|
; GFX900: ; %bb.0:
|
|
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
|
|
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
|
|
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
|
|
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
|
|
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
|
|
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
|
|
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
|
|
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
|
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
|
; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
|
|
; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
|
|
; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
|
|
; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43
|
|
; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42
|
|
; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41
|
|
; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40
|
|
; GFX900-NEXT: s_waitcnt vmcnt(6)
|
|
; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39
|
|
; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38
|
|
; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19
|
|
; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18
|
|
; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17
|
|
; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16
|
|
; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36
|
|
; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51
|
|
; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50
|
|
; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49
|
|
; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48
|
|
; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47
|
|
; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46
|
|
; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45
|
|
; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44
|
|
; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15
|
|
; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14
|
|
; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13
|
|
; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12
|
|
; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11
|
|
; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10
|
|
; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9
|
|
; GFX900-NEXT: v_fma_f32 v21, v21, s8, s8
|
|
; GFX900-NEXT: v_fma_f32 v28, v28, s23, s23
|
|
; GFX900-NEXT: v_fma_f32 v27, v27, s22, s22
|
|
; GFX900-NEXT: v_fma_f32 v26, v26, s21, s21
|
|
; GFX900-NEXT: v_fma_f32 v25, v25, s20, s20
|
|
; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96
|
|
; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112
|
|
; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64
|
|
; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
|
|
; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32
|
|
; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48
|
|
; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1]
|
|
; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
|
|
; GFX900-NEXT: s_endpgm
|
|
;
|
|
; PACKED-SDAG-LABEL: fma_v32_vs:
|
|
; PACKED-SDAG: ; %bb.0:
|
|
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
|
|
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
|
|
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
|
|
; PACKED-SDAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
|
|
; PACKED-SDAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
|
|
; PACKED-SDAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
|
|
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
|
|
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
|
|
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
|
|
; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
|
; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
|
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41]
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43]
|
|
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[38:39], s[38:39]
|
|
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(5)
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[48:49], s[48:49]
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[50:51], s[50:51]
|
|
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(4)
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[44:45], s[44:45]
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[46:47], s[46:47]
|
|
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[16:17], s[16:17]
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[18:19], s[18:19]
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[12:13], s[12:13]
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[14:15], s[14:15]
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[10:11], s[10:11]
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[20:21], s[20:21]
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[22:23], s[22:23]
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[36:37], s[36:37]
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[8:9], s[8:9]
|
|
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
|
|
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
|
|
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
|
|
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
|
|
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
|
|
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
|
|
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1]
|
|
; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
|
|
; PACKED-SDAG-NEXT: s_endpgm
|
|
;
|
|
; PACKED-GISEL-LABEL: fma_v32_vs:
|
|
; PACKED-GISEL: ; %bb.0:
|
|
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
|
|
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
|
|
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
|
|
; PACKED-GISEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
|
|
; PACKED-GISEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
|
|
; PACKED-GISEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
|
|
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
|
|
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
|
|
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
|
|
; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
|
|
; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
|
|
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[36:37], s[36:37]
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[38:39], s[38:39]
|
|
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[40:41], s[40:41]
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[42:43], s[42:43]
|
|
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(5)
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[44:45], s[44:45]
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[46:47], s[46:47]
|
|
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(4)
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[48:49], s[48:49]
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[50:51], s[50:51]
|
|
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(3)
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[8:9], s[8:9]
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[10:11], s[10:11]
|
|
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(2)
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[12:13], s[12:13]
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[14:15], s[14:15]
|
|
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(1)
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[16:17], s[16:17]
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[18:19], s[18:19]
|
|
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[20:21], s[20:21]
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[22:23], s[22:23]
|
|
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
|
|
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
|
|
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
|
|
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
|
|
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
|
|
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
|
|
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
|
|
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
|
|
; PACKED-GISEL-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-SDAG-LABEL: fma_v32_vs:
|
|
; GFX1250-SDAG: ; %bb.0:
|
|
; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
|
|
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
|
|
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-SDAG-NEXT: s_clause 0x7
|
|
; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
|
|
; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
|
|
; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
|
|
; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
|
|
; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
|
|
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
|
|
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
|
|
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
|
|
; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
|
|
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
|
|
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[20:21]
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[22:23]
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[30:31]
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[28:29]
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[12:13]
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[14:15]
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[4:5]
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[6:7]
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[24:25]
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[26:27]
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[18:19]
|
|
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33]
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35]
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
|
|
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[40:41], v[40:41]
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[38:39], v[38:39]
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
|
|
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[52:53], v[52:53]
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[32:33], v[32:33]
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[34:35], v[34:35]
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[54:55], v[54:55]
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[50:51], v[50:51]
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[40:41], v[40:41]
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], v[46:47], v[46:47]
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], v[48:49], v[48:49]
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[42:43], v[42:43]
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[44:45], v[44:45]
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[36:37], v[36:37]
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[38:39], v[38:39]
|
|
; GFX1250-SDAG-NEXT: s_clause 0x7
|
|
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
|
|
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
|
|
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
|
|
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
|
|
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
|
|
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
|
|
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
|
|
; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
|
|
; GFX1250-SDAG-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-GISEL-LABEL: fma_v32_vs:
|
|
; GFX1250-GISEL: ; %bb.0:
|
|
; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
|
|
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
|
|
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-GISEL-NEXT: s_clause 0x7
|
|
; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35]
|
|
; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16
|
|
; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
|
|
; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48
|
|
; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64
|
|
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80
|
|
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96
|
|
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
|
|
; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
|
|
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
|
|
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7]
|
|
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33]
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
|
|
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[36:37], v[36:37]
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[38:39], v[38:39]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
|
|
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[40:41], v[40:41]
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[42:43], v[42:43]
|
|
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[44:45], v[44:45]
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[46:47], v[46:47]
|
|
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[16:17], v[16:17], v[48:49], v[48:49]
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[18:19], v[18:19], v[50:51], v[50:51]
|
|
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[52:53], v[52:53]
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[54:55], v[54:55]
|
|
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[32:33], v[32:33]
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[34:35], v[34:35]
|
|
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[36:37], v[36:37]
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[38:39], v[38:39]
|
|
; GFX1250-GISEL-NEXT: s_clause 0x7
|
|
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35]
|
|
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16
|
|
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
|
|
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48
|
|
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64
|
|
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80
|
|
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96
|
|
; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
|
|
; GFX1250-GISEL-NEXT: s_endpgm
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
|
|
%load = load <32 x float>, ptr addrspace(1) %gep, align 128
|
|
%fma = tail call <32 x float> @llvm.fma.v32f32(<32 x float> %load, <32 x float> %x, <32 x float> %x)
|
|
store <32 x float> %fma, ptr addrspace(1) %gep, align 128
|
|
ret void
|
|
}
|
|
|
|
; Test: llvm.fma.v2f32 where the multiplier and the addend are both splat
; constants (100.0 = 0x42c80000 and 200.0 = 0x43480000).
; Each work-item loads one <2 x float> from %a (indexed by workitem.id.x),
; applies the fma and stores the result back to the same address.
; Per the checks below, every run line first moves the two constants into
; registers. The gfx900 run then issues two scalar v_fma_f32, while the other
; runs issue a single v_pk_fma_f32. The SelectionDAG outputs keep one 32-bit
; copy of each constant and mark it with op_sel_hi:[1,0,0]; the GlobalISel
; outputs instead fill both halves of each 64-bit register pair.
define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
|
|
; GFX900-LABEL: fma_v2_v_imm:
|
|
; GFX900: ; %bb.0:
|
|
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; GFX900-NEXT: s_mov_b32 s2, 0x42c80000
|
|
; GFX900-NEXT: v_mov_b32_e32 v3, 0x43480000
|
|
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX900-NEXT: v_fma_f32 v1, v1, s2, v3
|
|
; GFX900-NEXT: v_fma_f32 v0, v0, s2, v3
|
|
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX900-NEXT: s_endpgm
|
|
;
|
|
; PACKED-SDAG-LABEL: fma_v2_v_imm:
|
|
; PACKED-SDAG: ; %bb.0:
|
|
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v3, 3, v0
|
|
; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, 0x43480000
|
|
; PACKED-SDAG-NEXT: s_mov_b32 s2, 0x42c80000
|
|
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
|
|
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] op_sel_hi:[1,0,0]
|
|
; PACKED-SDAG-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
|
|
; PACKED-SDAG-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-GISEL-LABEL: fma_v2_v_imm:
|
|
; GFX90A-GISEL: ; %bb.0:
|
|
; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
|
|
; GFX90A-GISEL-NEXT: s_mov_b32 s4, 0x43480000
|
|
; GFX90A-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
|
|
; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
|
|
; GFX90A-GISEL-NEXT: s_mov_b32 s5, s4
|
|
; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2
|
|
; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
|
|
; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
|
|
; GFX90A-GISEL-NEXT: s_endpgm
|
|
;
|
|
; GFX942-GISEL-LABEL: fma_v2_v_imm:
|
|
; GFX942-GISEL: ; %bb.0:
|
|
; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
|
|
; GFX942-GISEL-NEXT: s_mov_b32 s4, 0x43480000
|
|
; GFX942-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
|
|
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
|
|
; GFX942-GISEL-NEXT: s_mov_b32 s5, s4
|
|
; GFX942-GISEL-NEXT: s_mov_b32 s3, s2
|
|
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
|
|
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
|
|
; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
|
|
; GFX942-GISEL-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-SDAG-LABEL: fma_v2_v_imm:
|
|
; GFX1250-SDAG: ; %bb.0:
|
|
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
|
|
; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0x43480000
|
|
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-SDAG-NEXT: s_mov_b32 s4, 0x42c80000
|
|
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
|
|
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[4:5], s[2:3] op_sel_hi:[1,0,0]
|
|
; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
|
|
; GFX1250-SDAG-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-GISEL-LABEL: fma_v2_v_imm:
|
|
; GFX1250-GISEL: ; %bb.0:
|
|
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0
|
|
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
|
|
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0x43480000
|
|
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
|
|
; GFX1250-GISEL-NEXT: s_mov_b32 s5, s4
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
|
|
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v6, s[0:1] scale_offset
|
|
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5]
|
|
; GFX1250-GISEL-NEXT: global_store_b64 v6, v[0:1], s[0:1] scale_offset
|
|
; GFX1250-GISEL-NEXT: s_endpgm
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
|
|
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
|
|
%fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 100.0, float 100.0>, <2 x float> <float 200.0, float 200.0>)
|
|
store <2 x float> %fma, ptr addrspace(1) %gep, align 8
|
|
ret void
|
|
}
|
|
|
|
; Test: llvm.fma.v2f32 where the multiplier and the addend are the same
; non-constant splat. The splat value is workitem.id.x bitcast to float and
; inserted into both lanes of a <2 x float>.
; Each work-item loads one <2 x float> from %a, applies the fma and stores the
; result back to the same address.
; Per the checks below, the gfx900 run issues two scalar v_fma_f32 that read
; v0 for both extra operands. The SelectionDAG packed outputs issue one
; v_pk_fma_f32 with op_sel_hi:[1,0,0] on v[0:1]; the GlobalISel packed outputs
; first copy v0 into v1 and then issue v_pk_fma_f32 without op_sel_hi.
define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) {
|
|
; GFX900-LABEL: fma_v2_v_v_splat:
|
|
; GFX900: ; %bb.0:
|
|
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX900-NEXT: v_lshlrev_b32_e32 v3, 3, v0
|
|
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX900-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX900-NEXT: v_fma_f32 v2, v2, v0, v0
|
|
; GFX900-NEXT: v_fma_f32 v1, v1, v0, v0
|
|
; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1]
|
|
; GFX900-NEXT: s_endpgm
|
|
;
|
|
; PACKED-SDAG-LABEL: fma_v2_v_v_splat:
|
|
; PACKED-SDAG: ; %bb.0:
|
|
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 3, v0
|
|
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-SDAG-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
|
|
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[0:1], v[0:1] op_sel_hi:[1,0,0]
|
|
; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
|
|
; PACKED-SDAG-NEXT: s_endpgm
|
|
;
|
|
; PACKED-GISEL-LABEL: fma_v2_v_v_splat:
|
|
; PACKED-GISEL: ; %bb.0:
|
|
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
|
|
; PACKED-GISEL-NEXT: v_mov_b32_e32 v1, v0
|
|
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-GISEL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
|
|
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[0:1], v[0:1]
|
|
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
|
|
; PACKED-GISEL-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-SDAG-LABEL: fma_v2_v_v_splat:
|
|
; GFX1250-SDAG: ; %bb.0:
|
|
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-SDAG-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset
|
|
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[0:1], v[0:1] op_sel_hi:[1,0,0]
|
|
; GFX1250-SDAG-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
|
|
; GFX1250-SDAG-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-GISEL-LABEL: fma_v2_v_v_splat:
|
|
; GFX1250-GISEL: ; %bb.0:
|
|
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-GISEL-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset
|
|
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[0:1], v[0:1]
|
|
; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
|
|
; GFX1250-GISEL-NEXT: s_endpgm
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
|
|
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
|
|
%fid = bitcast i32 %id to float
|
|
%tmp1 = insertelement <2 x float> poison, float %fid, i64 0
|
|
%k = insertelement <2 x float> %tmp1, float %fid, i64 1
|
|
%fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %k, <2 x float> %k)
|
|
store <2 x float> %fma, ptr addrspace(1) %gep, align 8
|
|
ret void
|
|
}
|
|
|
|
; Test: llvm.fma.v2f32 where the multiplier is a splat of 4.0 and the addend
; is a splat of 1.0.
; Each work-item loads one <2 x float> from %a (indexed by workitem.id.x),
; applies the fma and stores the result back to the same address.
; Per the checks below, the gfx900 run and the SelectionDAG packed outputs use
; 4.0 and 1.0 directly as instruction operands (the packed form adds
; op_sel_hi:[1,0,0]). The GlobalISel outputs instead move each constant into
; both halves of a register pair before the v_pk_fma_f32.
define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
|
|
; GFX900-LABEL: fma_v2_v_lit_splat:
|
|
; GFX900: ; %bb.0:
|
|
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX900-NEXT: v_fma_f32 v1, v1, 4.0, 1.0
|
|
; GFX900-NEXT: v_fma_f32 v0, v0, 4.0, 1.0
|
|
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX900-NEXT: s_endpgm
|
|
;
|
|
; PACKED-SDAG-LABEL: fma_v2_v_lit_splat:
|
|
; PACKED-SDAG: ; %bb.0:
|
|
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, 1.0 op_sel_hi:[1,0,0]
|
|
; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; PACKED-SDAG-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-GISEL-LABEL: fma_v2_v_lit_splat:
|
|
; GFX90A-GISEL: ; %bb.0:
|
|
; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
|
|
; GFX90A-GISEL-NEXT: s_mov_b32 s4, 1.0
|
|
; GFX90A-GISEL-NEXT: s_mov_b32 s2, 4.0
|
|
; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
|
|
; GFX90A-GISEL-NEXT: s_mov_b32 s5, s4
|
|
; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2
|
|
; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
|
|
; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
|
|
; GFX90A-GISEL-NEXT: s_endpgm
|
|
;
|
|
; GFX942-GISEL-LABEL: fma_v2_v_lit_splat:
|
|
; GFX942-GISEL: ; %bb.0:
|
|
; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
|
|
; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0
|
|
; GFX942-GISEL-NEXT: s_mov_b32 s2, 4.0
|
|
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
|
|
; GFX942-GISEL-NEXT: s_mov_b32 s5, s4
|
|
; GFX942-GISEL-NEXT: s_mov_b32 s3, s2
|
|
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
|
|
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
|
|
; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
|
|
; GFX942-GISEL-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-SDAG-LABEL: fma_v2_v_lit_splat:
|
|
; GFX1250-SDAG: ; %bb.0:
|
|
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
|
|
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
|
|
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, 1.0 op_sel_hi:[1,0,0]
|
|
; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
|
|
; GFX1250-SDAG-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-GISEL-LABEL: fma_v2_v_lit_splat:
|
|
; GFX1250-GISEL: ; %bb.0:
|
|
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0
|
|
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 4.0
|
|
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1.0
|
|
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
|
|
; GFX1250-GISEL-NEXT: s_mov_b32 s5, s4
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
|
|
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v6, s[0:1] scale_offset
|
|
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5]
|
|
; GFX1250-GISEL-NEXT: global_store_b64 v6, v[0:1], s[0:1] scale_offset
|
|
; GFX1250-GISEL-NEXT: s_endpgm
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
|
|
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
|
|
%fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 4.0, float 4.0>, <2 x float> <float 1.0, float 1.0>)
|
|
store <2 x float> %fma, ptr addrspace(1) %gep, align 8
|
|
ret void
|
|
}
|
|
|
|
; Test: llvm.fma.v2f32 where the constant operands are NOT splats. The
; multiplier is <4.0, 3.0> and the addend is <1.0, 2.0>, so the two lanes need
; different constants.
; Each work-item loads one <2 x float> from %a (indexed by workitem.id.x),
; applies the fma and stores the result back to the same address.
; Per the checks below, the gfx900 run issues one v_fma_f32 per lane (3.0 is
; placed in s2 as 0x40400000). Every packed output builds both constant vectors
; in 64-bit register pairs and issues v_pk_fma_f32 with no op_sel_hi modifier.
; The gfx1250 outputs use lit64 values, e.g. 0x4040000040800000 holds 3.0 in
; the high half and 4.0 in the low half.
define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
|
|
; GFX900-LABEL: fma_v2_v_unfoldable_lit:
|
|
; GFX900: ; %bb.0:
|
|
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; GFX900-NEXT: s_mov_b32 s2, 0x40400000
|
|
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX900-NEXT: v_fma_f32 v1, v1, s2, 2.0
|
|
; GFX900-NEXT: v_fma_f32 v0, v0, 4.0, 1.0
|
|
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX900-NEXT: s_endpgm
|
|
;
|
|
; PACKED-SDAG-LABEL: fma_v2_v_unfoldable_lit:
|
|
; PACKED-SDAG: ; %bb.0:
|
|
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 3, v0
|
|
; PACKED-SDAG-NEXT: s_mov_b32 s2, 4.0
|
|
; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, 1.0
|
|
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
|
|
; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, 2.0
|
|
; PACKED-SDAG-NEXT: s_mov_b32 s3, 0x40400000
|
|
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
|
|
; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
|
|
; PACKED-SDAG-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-GISEL-LABEL: fma_v2_v_unfoldable_lit:
|
|
; GFX90A-GISEL: ; %bb.0:
|
|
; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
|
|
; GFX90A-GISEL-NEXT: s_mov_b32 s4, 1.0
|
|
; GFX90A-GISEL-NEXT: s_mov_b32 s2, 4.0
|
|
; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
|
|
; GFX90A-GISEL-NEXT: s_mov_b32 s5, 2.0
|
|
; GFX90A-GISEL-NEXT: s_mov_b32 s3, 0x40400000
|
|
; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
|
|
; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
|
|
; GFX90A-GISEL-NEXT: s_endpgm
|
|
;
|
|
; GFX942-GISEL-LABEL: fma_v2_v_unfoldable_lit:
|
|
; GFX942-GISEL: ; %bb.0:
|
|
; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
|
|
; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0
|
|
; GFX942-GISEL-NEXT: s_mov_b32 s2, 4.0
|
|
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
|
|
; GFX942-GISEL-NEXT: s_mov_b32 s5, 2.0
|
|
; GFX942-GISEL-NEXT: s_mov_b32 s3, 0x40400000
|
|
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
|
|
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
|
|
; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
|
|
; GFX942-GISEL-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-SDAG-LABEL: fma_v2_v_unfoldable_lit:
|
|
; GFX1250-SDAG: ; %bb.0:
|
|
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-SDAG-NEXT: v_and_b32_e32 v6, 0x3ff, v0
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4040000040800000)
|
|
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[4:5], lit64(0x400000003f800000)
|
|
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v6, s[0:1] scale_offset
|
|
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5]
|
|
; GFX1250-SDAG-NEXT: global_store_b64 v6, v[0:1], s[0:1] scale_offset
|
|
; GFX1250-SDAG-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-GISEL-LABEL: fma_v2_v_unfoldable_lit:
|
|
; GFX1250-GISEL: ; %bb.0:
|
|
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0
|
|
; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x4040000040800000)
|
|
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-GISEL-NEXT: s_mov_b64 s[4:5], lit64(0x400000003f800000)
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
|
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
|
|
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v6, s[0:1] scale_offset
|
|
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5]
|
|
; GFX1250-GISEL-NEXT: global_store_b64 v6, v[0:1], s[0:1] scale_offset
|
|
; GFX1250-GISEL-NEXT: s_endpgm
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
|
|
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
|
|
%fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 4.0, float 3.0>, <2 x float> <float 1.0, float 2.0>)
|
|
store <2 x float> %fma, ptr addrspace(1) %gep, align 8
|
|
ret void
|
|
}
|
|
|
|
; Test: llvm.fma.v2f32 where the multiplier and the addend are the same splat
; of a negated kernel argument. The negation is written as
; "fsub float -0.0, %x" and the result is inserted into both lanes.
; Each work-item loads one <2 x float> from %a (indexed by workitem.id.x),
; applies the fma and stores the result back to the same address.
; Per the checks below, the gfx900 run and the SelectionDAG packed outputs
; express the negation as source modifiers on the fma itself (-s2, or
; neg_lo:[0,1,1] neg_hi:[0,1,1] together with op_sel_hi:[1,0,0]). The
; GlobalISel outputs instead emit a separate v_max_f32_e64 / v_max_num_f32_e64
; with negated sources, copy the result into the second lane, and then issue
; a v_pk_fma_f32 without modifiers.
define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) {
|
|
; GFX900-LABEL: fma_v2_v_fneg:
|
|
; GFX900: ; %bb.0:
|
|
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c
|
|
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX900-NEXT: v_fma_f32 v1, v1, -s2, -s2
|
|
; GFX900-NEXT: v_fma_f32 v0, v0, -s2, -s2
|
|
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX900-NEXT: s_endpgm
|
|
;
|
|
; PACKED-SDAG-LABEL: fma_v2_v_fneg:
|
|
; PACKED-SDAG: ; %bb.0:
|
|
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
|
|
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1]
|
|
; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; PACKED-SDAG-NEXT: s_endpgm
|
|
;
|
|
; PACKED-GISEL-LABEL: fma_v2_v_fneg:
|
|
; PACKED-GISEL: ; %bb.0:
|
|
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
|
|
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
|
|
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
|
|
; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2
|
|
; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, v2
|
|
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
|
|
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3]
|
|
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
|
|
; PACKED-GISEL-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-SDAG-LABEL: fma_v2_v_fneg:
|
|
; GFX1250-SDAG: ; %bb.0:
|
|
; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
|
|
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
|
|
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
|
|
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1]
|
|
; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
|
|
; GFX1250-SDAG-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-GISEL-LABEL: fma_v2_v_fneg:
|
|
; GFX1250-GISEL: ; %bb.0:
|
|
; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
|
|
; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
|
|
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
|
|
; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -s2, -s2
|
|
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3]
|
|
; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
|
|
; GFX1250-GISEL-NEXT: s_endpgm
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
|
|
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
|
|
%fneg = fsub float -0.0, %x
|
|
%tmp1 = insertelement <2 x float> poison, float %fneg, i64 0
|
|
%k = insertelement <2 x float> %tmp1, float %fneg, i64 1
|
|
%fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %k, <2 x float> %k)
|
|
store <2 x float> %fma, ptr addrspace(1) %gep, align 8
|
|
ret void
|
|
}
|
|
|
|
; Test: fadd of a <2 x float> with a broadcast of a negated scalar.
; The vector is a volatile load through %lds and the scalar is a volatile load
; through %arg2 (both addrspace(3) pointers). The scalar is negated with
; "fsub float -0.0, %scalar0", inserted into lane 0, and broadcast to both
; lanes by a shufflevector with a zeroinitializer mask. The sum is stored
; to %out.
; Per the checks below, the gfx900 run turns this into two v_sub_f32. The
; SelectionDAG packed outputs issue one v_pk_add_f32 with op_sel_hi:[1,0]
; neg_lo:[0,1] neg_hi:[0,1]. The GlobalISel outputs emit a separate
; v_max_f32_e64 / v_max_num_f32_e64 with negated sources, copy it into the
; second lane, and then issue a v_pk_add_f32 without modifiers.
define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) {
|
|
; GFX900-LABEL: add_vector_neg_bitcast_scalar_lo:
|
|
; GFX900: ; %bb.0: ; %bb
|
|
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX900-NEXT: v_mov_b32_e32 v3, 0
|
|
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX900-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX900-NEXT: v_mov_b32_e32 v2, s3
|
|
; GFX900-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
|
|
; GFX900-NEXT: ds_read_b32 v2, v2
|
|
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX900-NEXT: v_sub_f32_e32 v1, v1, v2
|
|
; GFX900-NEXT: v_sub_f32_e32 v0, v0, v2
|
|
; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
|
|
; GFX900-NEXT: s_endpgm
|
|
;
|
|
; PACKED-SDAG-LABEL: add_vector_neg_bitcast_scalar_lo:
|
|
; PACKED-SDAG: ; %bb.0: ; %bb
|
|
; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, 0
|
|
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s2
|
|
; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s3
|
|
; PACKED-SDAG-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
|
|
; PACKED-SDAG-NEXT: ds_read_b32 v2, v2
|
|
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
|
|
; PACKED-SDAG-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
|
|
; PACKED-SDAG-NEXT: s_endpgm
|
|
;
|
|
; PACKED-GISEL-LABEL: add_vector_neg_bitcast_scalar_lo:
|
|
; PACKED-GISEL: ; %bb.0: ; %bb
|
|
; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s2
|
|
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s3
|
|
; PACKED-GISEL-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
|
|
; PACKED-GISEL-NEXT: ds_read_b32 v2, v2
|
|
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
|
; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2
|
|
; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, v2
|
|
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
|
|
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
|
|
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; PACKED-GISEL-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-SDAG-LABEL: add_vector_neg_bitcast_scalar_lo:
|
|
; GFX1250-SDAG: ; %bb.0: ; %bb
|
|
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
|
|
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s3
|
|
; GFX1250-SDAG-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
|
|
; GFX1250-SDAG-NEXT: ds_load_b32 v2, v2
|
|
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
|
|
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
|
|
; GFX1250-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
|
|
; GFX1250-SDAG-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-GISEL-LABEL: add_vector_neg_bitcast_scalar_lo:
|
|
; GFX1250-GISEL: ; %bb.0: ; %bb
|
|
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3
|
|
; GFX1250-GISEL-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
|
|
; GFX1250-GISEL-NEXT: ds_load_b32 v2, v2
|
|
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
|
|
; GFX1250-GISEL-NEXT: v_max_num_f32_e64 v2, -v2, -v2
|
|
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
|
|
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
|
; GFX1250-GISEL-NEXT: s_endpgm
|
|
bb:
|
|
%vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 4
|
|
%scalar0 = load volatile float, ptr addrspace(3) %arg2, align 4
|
|
%neg.scalar0 = fsub float -0.0, %scalar0
|
|
|
|
|
|
%neg.scalar0.vec = insertelement <2 x float> poison, float %neg.scalar0, i32 0
|
|
%neg.scalar0.broadcast = shufflevector <2 x float> %neg.scalar0.vec, <2 x float> poison, <2 x i32> zeroinitializer
|
|
|
|
|
|
%result = fadd <2 x float> %vec0, %neg.scalar0.broadcast
|
|
store <2 x float> %result, ptr addrspace(1) %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; fma.v2f32 whose addend is assembled lane by lane from two volatile scalar LDS
; loads and then negated as (-0.0 - x).
; Expectations recorded below: GFX900 uses two v_fma_f32 with a negated third
; operand; the packed SelectionDAG runs use a single v_pk_fma_f32 with
; neg_lo/neg_hi set on the third operand; the packed GlobalISel runs emit an
; extra v_pk_mul_f32 by 1.0 on the addend before the v_pk_fma_f32.
define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) {
; GFX900-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
; GFX900: ; %bb.0: ; %bb
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v2, s2
; GFX900-NEXT: v_mov_b32_e32 v4, s3
; GFX900-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; GFX900-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX900-NEXT: ds_read_b32 v5, v4
; GFX900-NEXT: ds_read_b32 v4, v4 offset:8
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: v_fma_f32 v0, v0, v2, -v5
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_fma_f32 v1, v1, v3, -v4
; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
; PACKED-SDAG: ; %bb.0: ; %bb
; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_mov_b32_e32 v6, 0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s2
; PACKED-SDAG-NEXT: v_mov_b32_e32 v5, s3
; PACKED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; PACKED-SDAG-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; PACKED-SDAG-NEXT: ds_read_b32 v4, v5
; PACKED-SDAG-NEXT: ds_read_b32 v5, v5 offset:8
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
; PACKED-SDAG-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
; PACKED-GISEL: ; %bb.0: ; %bb
; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s2
; PACKED-GISEL-NEXT: v_mov_b32_e32 v5, s3
; PACKED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; PACKED-GISEL-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; PACKED-GISEL-NEXT: ds_read_b32 v4, v5
; PACKED-GISEL-NEXT: ds_read_b32 v5, v5 offset:8
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1]
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
; GFX1250-SDAG: ; %bb.0: ; %bb
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v2, s2
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v5, s3
; GFX1250-SDAG-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; GFX1250-SDAG-NEXT: ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3
; GFX1250-SDAG-NEXT: ds_load_b32 v4, v5
; GFX1250-SDAG-NEXT: ds_load_b32 v5, v5 offset:8
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX1250-SDAG-NEXT: global_store_b64 v6, v[0:1], s[0:1]
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
; GFX1250-GISEL: ; %bb.0: ; %bb
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s3
; GFX1250-GISEL-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; GFX1250-GISEL-NEXT: ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3
; GFX1250-GISEL-NEXT: ds_load_b32 v4, v5
; GFX1250-GISEL-NEXT: ds_load_b32 v5, v5 offset:8
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-GISEL-NEXT: s_endpgm
bb:
  %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
  %arg2.gep = getelementptr inbounds float, ptr addrspace(3) %arg2, i32 2

  ; The two multiplicands are adjacent <2 x float> values in %lds.
  %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 4
  %vec1 = load volatile <2 x float>, ptr addrspace(3) %lds.gep1, align 4

  ; The addend lanes come from %arg2[0] and %arg2[2] (not contiguous).
  %scalar0 = load volatile float, ptr addrspace(3) %arg2, align 4
  %scalar1 = load volatile float, ptr addrspace(3) %arg2.gep, align 4

  %vec.ins0 = insertelement <2 x float> poison, float %scalar0, i32 0
  %vec2 = insertelement <2 x float> %vec.ins0, float %scalar1, i32 1
  ; Negation is spelled as a subtraction from -0.0 in both lanes.
  %neg.vec2 = fsub <2 x float> <float -0.0, float -0.0>, %vec2

  %result = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %vec0, <2 x float> %vec1, <2 x float> %neg.vec2)
  store <2 x float> %result, ptr addrspace(1) %out, align 4
  ret void
}

; Adds %vec0 to %vec1 with the two lanes of %vec1 swapped.
; Expectations recorded below: GFX900 uses two v_add_f32 with crossed source
; registers; every packed run folds the lane swap into the second operand of
; v_pk_add_f32 via op_sel:[0,1] op_sel_hi:[1,0].
define amdgpu_kernel void @shuffle_add_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
; GFX900-LABEL: shuffle_add_f32:
; GFX900: ; %bb.0: ; %bb
; GFX900-NEXT: s_load_dword s0, s[4:5], 0x2c
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v2, s0
; GFX900-NEXT: ds_read_b64 v[0:1], v2
; GFX900-NEXT: ds_read_b64 v[2:3], v2 offset:8
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v1, v1, v2
; GFX900-NEXT: v_add_f32_e32 v0, v0, v3
; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: shuffle_add_f32:
; PACKED-SDAG: ; %bb.0: ; %bb
; PACKED-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c
; PACKED-SDAG-NEXT: v_mov_b32_e32 v4, 0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s0
; PACKED-SDAG-NEXT: ds_read_b64 v[0:1], v2
; PACKED-SDAG-NEXT: ds_read_b64 v[2:3], v2 offset:8
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0]
; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: shuffle_add_f32:
; PACKED-GISEL: ; %bb.0: ; %bb
; PACKED-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s0
; PACKED-GISEL-NEXT: ds_read_b64 v[0:1], v2
; PACKED-GISEL-NEXT: ds_read_b64 v[2:3], v2 offset:8
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0]
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: shuffle_add_f32:
; GFX1250-SDAG: ; %bb.0: ; %bb
; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v2, s2
; GFX1250-SDAG-NEXT: ds_load_b64 v[0:1], v2
; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 offset:8
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0]
; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: shuffle_add_f32:
; GFX1250-GISEL: ; %bb.0: ; %bb
; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1250-GISEL-NEXT: ds_load_b64 v[0:1], v2
; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 offset:8
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0]
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-GISEL-NEXT: s_endpgm
bb:
  %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8
  %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
  %vec1 = load volatile <2 x float>, ptr addrspace(3) %lds.gep1, align 8
  ; Mask <1, 0> exchanges the low and high lanes of %vec1.
  %vec1.swap = shufflevector <2 x float> %vec1, <2 x float> poison, <2 x i32> <i32 1, i32 0>
  %result = fadd <2 x float> %vec0, %vec1.swap
  store <2 x float> %result, ptr addrspace(1) %out, align 8
  ret void
}

; Adds %vec0 to %vec1 after %vec1 has been negated (as -0.0 - x) and had its
; two lanes swapped. A volatile float load through a poison pointer sits
; between the two vector loads; it appears as the lone ds_read_b32/ds_load_b32
; in every expectation block.
; Expectations recorded below: GFX900 uses two v_sub_f32 with crossed sources;
; the SelectionDAG packed runs fold both the swap and the negation into one
; v_pk_add_f32 (op_sel plus neg_lo/neg_hi); the GlobalISel packed runs keep the
; negation as a separate v_pk_mul_f32 by 1.0 and fold only the swap.
define amdgpu_kernel void @shuffle_neg_add_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
; GFX900-LABEL: shuffle_neg_add_f32:
; GFX900: ; %bb.0: ; %bb
; GFX900-NEXT: s_load_dword s0, s[4:5], 0x2c
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v2, s0
; GFX900-NEXT: ds_read_b64 v[0:1], v2
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: ds_read_b32 v3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: ds_read_b64 v[2:3], v2 offset:8
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_sub_f32_e32 v1, v1, v2
; GFX900-NEXT: v_sub_f32_e32 v0, v0, v3
; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: shuffle_neg_add_f32:
; PACKED-SDAG: ; %bb.0: ; %bb
; PACKED-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c
; PACKED-SDAG-NEXT: v_mov_b32_e32 v4, 0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s0
; PACKED-SDAG-NEXT: ds_read_b64 v[0:1], v2
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: ds_read_b32 v3, v0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: ds_read_b64 v[2:3], v2 offset:8
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: shuffle_neg_add_f32:
; PACKED-GISEL: ; %bb.0: ; %bb
; PACKED-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s0
; PACKED-GISEL-NEXT: ds_read_b64 v[0:1], v2
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: ds_read_b32 v3, v0
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: ds_read_b64 v[2:3], v2 offset:8
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0]
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: shuffle_neg_add_f32:
; GFX1250-SDAG: ; %bb.0: ; %bb
; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v2, s2
; GFX1250-SDAG-NEXT: ds_load_b64 v[0:1], v2
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ds_load_b32 v3, v0
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 offset:8
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: shuffle_neg_add_f32:
; GFX1250-GISEL: ; %bb.0: ; %bb
; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1250-GISEL-NEXT: ds_load_b64 v[0:1], v2
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ds_load_b32 v3, v0
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 offset:8
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0]
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-GISEL-NEXT: s_endpgm
bb:
  %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8
  %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
  ; Result is unused; the load is volatile, so it is still emitted.
  %f32 = load volatile float, ptr addrspace(3) poison, align 8
  %vec1 = load volatile <2 x float>, ptr addrspace(3) %lds.gep1, align 8
  ; Negate first, then swap lanes with mask <1, 0>.
  %vec1.neg = fsub <2 x float> <float -0.0, float -0.0>, %vec1
  %vec1.neg.swap = shufflevector <2 x float> %vec1.neg, <2 x float> poison, <2 x i32> <i32 1, i32 0>
  %result = fadd <2 x float> %vec0, %vec1.neg.swap
  store <2 x float> %result, ptr addrspace(1) %out, align 8
  ret void
}

; Chain of fadd / lane shuffle / fadd / lane merge / fsub in which every
; constant operand is zeroinitializer; the result is stored through a poison
; pointer.
; Expectations recorded below: GFX900 and the packed SelectionDAG runs use two
; scalar-lane v_add_f32; GFX1250 SelectionDAG uses two s_add_f32; the
; GlobalISel runs keep two v_pk_add_f32 with a v_mov_b32 between them for the
; lane shift. The gfx90a and gfx942 GlobalISel blocks are separate because
; gfx942 has an additional s_nop.
define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
; GFX900-LABEL: fadd_fadd_fsub_0:
; GFX900: ; %bb.0: ; %bb
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_add_f32_e64 v0, s1, 0
; GFX900-NEXT: v_add_f32_e32 v1, 0, v0
; GFX900-NEXT: v_mov_b32_e32 v0, s0
; GFX900-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fadd_fadd_fsub_0:
; PACKED-SDAG: ; %bb.0: ; %bb
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: v_add_f32_e64 v0, s1, 0
; PACKED-SDAG-NEXT: v_add_f32_e32 v1, 0, v0
; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s0
; PACKED-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; GFX90A-GISEL-LABEL: fadd_fadd_fsub_0:
; GFX90A-GISEL: ; %bb.0: ; %bb
; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], 0
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX90A-GISEL-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: fadd_fadd_fsub_0:
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], 0
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, v1
; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0
; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v0
; GFX942-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fadd_fadd_fsub_0:
; GFX1250-SDAG: ; %bb.0: ; %bb
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_add_f32 s1, s1, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
; GFX1250-SDAG-NEXT: s_add_f32 s1, s1, 0
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX1250-SDAG-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_fadd_fsub_0:
; GFX1250-GISEL: ; %bb.0: ; %bb
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, v1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v0
; GFX1250-GISEL-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
bb:
  %i12 = fadd <2 x float> zeroinitializer, %arg
  ; Move lane 1 of %i12 into lane 0; lane 1 of the result is poison.
  %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
  %i13 = fadd <2 x float> zeroinitializer, %shift8
  ; Lane 0 comes from %arg, lane 1 from lane 0 of %i13.
  %i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2>
  %i15 = fsub <2 x float> %i14, zeroinitializer
  store <2 x float> %i15, ptr poison
  ret void
}

; Chain of fadd / lane shuffle / fadd / lane merge / fsub in which the second
; operand of each arithmetic step is the kernel argument %arg1; the result is
; stored to %ptr.
; Expectations recorded below: GFX900 is fully scalarized (v_add_f32,
; v_sub_f32, v_subrev_f32); the SelectionDAG packed runs use v_pk_add_f32 for
; the second add and, with neg_lo/neg_hi, for the final subtraction; the
; GlobalISel runs use v_pk_add_f32 for both adds and scalar-lane subtracts for
; the final fsub.
define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, ptr addrspace(1) %ptr) {
; GFX900-LABEL: fadd_fadd_fsub:
; GFX900: ; %bb.0: ; %bb
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, s3
; GFX900-NEXT: v_add_f32_e32 v0, s1, v0
; GFX900-NEXT: v_mov_b32_e32 v1, s2
; GFX900-NEXT: v_add_f32_e32 v3, s2, v0
; GFX900-NEXT: v_sub_f32_e32 v0, s0, v1
; GFX900-NEXT: v_subrev_f32_e32 v1, s3, v3
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fadd_fadd_fsub:
; PACKED-SDAG: ; %bb.0: ; %bb
; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; PACKED-SDAG-NEXT: v_mov_b32_e32 v4, 0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s3
; PACKED-SDAG-NEXT: v_add_f32_e32 v0, s1, v0
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], s[2:3], v[0:1] op_sel_hi:[1,0]
; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s0
; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, v0
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1]
; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
; PACKED-SDAG-NEXT: s_endpgm
;
; GFX90A-GISEL-LABEL: fadd_fadd_fsub:
; GFX90A-GISEL: ; %bb.0: ; %bb
; GFX90A-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1]
; GFX90A-GISEL-NEXT: v_sub_f32_e32 v0, s0, v2
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-GISEL-NEXT: v_pk_add_f32 v[2:3], s[2:3], v[2:3]
; GFX90A-GISEL-NEXT: v_subrev_f32_e32 v1, s3, v2
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX90A-GISEL-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: fadd_fadd_fsub:
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1]
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: v_sub_f32_e32 v0, s0, v2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, v1
; GFX942-GISEL-NEXT: v_pk_add_f32 v[2:3], s[2:3], v[2:3]
; GFX942-GISEL-NEXT: s_nop 0
; GFX942-GISEL-NEXT: v_subrev_f32_e32 v1, s3, v2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX942-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fadd_fadd_fsub:
; GFX1250-SDAG: ; %bb.0: ; %bb
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-SDAG-NEXT: s_add_f32 s2, s1, s3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_3)
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[0:1], s[2:3] op_sel_hi:[1,0]
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, v2
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[4:5], v[0:1] neg_lo:[0,1] neg_hi:[0,1]
; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_fadd_fsub:
; GFX1250-GISEL: ; %bb.0: ; %bb
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_sub_f32 s0, s0, s2
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, v1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1]
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_subrev_f32 v3, s3, v0
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[4:5]
; GFX1250-GISEL-NEXT: s_endpgm
bb:
  %i12 = fadd <2 x float> %arg, %arg1
  ; Move lane 1 of %i12 into lane 0; lane 1 of the result is poison.
  %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
  %i13 = fadd <2 x float> %arg1, %shift8
  ; Lane 0 comes from %arg, lane 1 from lane 0 of %i13.
  %i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2>
  %i15 = fsub <2 x float> %i14, %arg1
  store <2 x float> %i15, ptr addrspace(1) %ptr
  ret void
}

; Per-workitem <4 x float> load, add of the vector to a splat of its own
; lane 0, and store back to the same address.
; Expectations recorded below: GFX900 uses four v_add_f32 that all take v0 as
; the second source; the SelectionDAG packed runs use two v_pk_add_f32 with
; op_sel_hi:[1,0] so the low lane of v[0:1] feeds both halves; the GlobalISel
; packed runs first copy v0 into v[4:5] and then issue two plain v_pk_add_f32.
define amdgpu_kernel void @fadd_shuffle_v4(ptr addrspace(1) %arg) {
; GFX900-LABEL: fadd_shuffle_v4:
; GFX900: ; %bb.0: ; %bb
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v3, v3, v0
; GFX900-NEXT: v_add_f32_e32 v2, v2, v0
; GFX900-NEXT: v_add_f32_e32 v1, v1, v0
; GFX900-NEXT: v_add_f32_e32 v0, v0, v0
; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fadd_shuffle_v4:
; PACKED-SDAG: ; %bb.0: ; %bb
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[0:1] op_sel_hi:[1,0]
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1] op_sel_hi:[1,0]
; PACKED-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fadd_shuffle_v4:
; PACKED-GISEL: ; %bb.0: ; %bb
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v6, 4, v0
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_mov_b32_e32 v4, v0
; PACKED-GISEL-NEXT: v_mov_b32_e32 v5, v0
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5]
; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
; PACKED-GISEL-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fadd_shuffle_v4:
; GFX1250-SDAG: ; %bb.0: ; %bb
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[0:1] op_sel_hi:[1,0]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1] op_sel_hi:[1,0]
; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_shuffle_v4:
; GFX1250-GISEL: ; %bb.0: ; %bb
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v6, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
; GFX1250-GISEL-NEXT: global_store_b128 v6, v[0:3], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
  %in.1 = load <4 x float>, ptr addrspace(1) %gep
  ; An all-zero mask broadcasts lane 0 of %in.1 to every lane.
  %shuf = shufflevector <4 x float> %in.1, <4 x float> poison, <4 x i32> zeroinitializer
  %add.1 = fadd <4 x float> %in.1, %shuf
  store <4 x float> %add.1, ptr addrspace(1) %gep
  ret void
}

; Per-workitem <2 x float> load, negation written as (-0.0 - x), and store
; back to the same address.
; Expectations recorded below: GFX900 flips the sign bit of each lane with
; v_xor_b32 and 0x80000000; the SelectionDAG packed runs use v_pk_add_f32 with
; a zero second operand and neg_lo/neg_hi set on both operands; the GlobalISel
; packed runs use v_pk_mul_f32 by 1.0 with neg_lo/neg_hi on the loaded value.
define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) {
; GFX900-LABEL: fneg_v2f32_vec:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GFX900-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fneg_v2f32_vec:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 neg_lo:[1,1] neg_hi:[1,1]
; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fneg_v2f32_vec:
; PACKED-GISEL: ; %bb.0:
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fneg_v2f32_vec:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 neg_lo:[1,1] neg_hi:[1,1]
; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fneg_v2f32_vec:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  ; Negation is spelled as a subtraction from -0.0 in both lanes.
  %fneg = fsub <2 x float> <float -0.0, float -0.0>, %load
  store <2 x float> %fneg, ptr addrspace(1) %gep, align 8
  ret void
}

; Negation, written as (-0.0 - x), of a <2 x float> kernel argument, stored
; to %a.
; Expectations recorded below: GFX900 and every SelectionDAG run negate on the
; scalar unit with two s_xor_b32 against 0x80000000; the GlobalISel packed runs
; use v_pk_mul_f32 by 1.0 with neg_lo/neg_hi on the argument (GFX1250 first
; copies the argument into VGPRs with v_mov_b64).
define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x) {
; GFX900-LABEL: fneg_v2f32_scalar:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_xor_b32 s3, s3, 0x80000000
; GFX900-NEXT: s_xor_b32 s2, s2, 0x80000000
; GFX900-NEXT: v_mov_b32_e32 v0, s2
; GFX900-NEXT: v_mov_b32_e32 v1, s3
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fneg_v2f32_scalar:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, 0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000
; PACKED-SDAG-NEXT: s_xor_b32 s2, s2, 0x80000000
; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s2
; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, s3
; PACKED-SDAG-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fneg_v2f32_scalar:
; PACKED-GISEL: ; %bb.0:
; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, s[2:3] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: fneg_v2f32_scalar:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_xor_b32 s2, s2, 0x80000000
; GFX1250-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fneg_v2f32_scalar:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-GISEL-NEXT: s_endpgm
  ; Negation is spelled as a subtraction from -0.0 in both lanes.
  %fneg = fsub <2 x float> <float -0.0, float -0.0>, %x
  store <2 x float> %fneg, ptr addrspace(1) %a, align 8
  ret void
}

; Intrinsic declarations. workitem.id.x and fma.v2f32 are called by kernels
; directly above; the v4f32 and v32f32 fma variants are not referenced in this
; part of the file (presumably used by earlier tests - confirm before removing).
declare i32 @llvm.amdgcn.workitem.id.x()
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX90A-SDAG: {{.*}}
; GFX942-SDAG: {{.*}}