llvm-project/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
Matt Arsenault 2502e3b7ba
IR: Promote "denormal-fp-math" to a first class attribute (#174293)
Convert "denormal-fp-math" and "denormal-fp-math-f32" into a first
class denormal_fpenv attribute. Previously the query for the effective
denormal mode involved two string attribute queries with parsing. I'm
introducing more uses of this, so it makes sense to convert this
to a more efficient encoding. The old representation was also awkward
since it was split across two separate attributes. The new encoding
just stores the default and float modes as bitfields, largely avoiding
the need to consider if the other mode is set.

The syntax in the common cases looks like this:
  `denormal_fpenv(preservesign,preservesign)`
  `denormal_fpenv(float: preservesign,preservesign)`
  `denormal_fpenv(dynamic,dynamic float: preservesign,preservesign)`

I wasn't sure about reusing the float type name instead of adding a
new keyword. It's parsed as a type but only accepts float. I'm also
debating switching the name to subnormal to match the current
preferred IEEE terminology (also used by nofpclass and other
contexts).

This changes behavior when the command-line debug flags are used
to set the denormal mode. Previously the flag handling
ignored functions with an explicit attribute set, separately for
the default and f32 versions. Now that these are one attribute,
the flag logic can't distinguish which of the two components
were explicitly set on the function. Only one test appeared to
rely on this behavior, so I just avoided using the flags in it.

This also does not perform all the code cleanups this enables.
In particular the attributor handling could be cleaned up.

I also guessed at how to support this in MLIR. I followed
MemoryEffects as a reference; it appears bitfields are expanded
into arguments to attributes, so the representation there is
a bit uglier with the 2 2-element fields flattened into 4 arguments.
2026-02-05 13:31:26 +00:00

8329 lines
334 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=hawaii -start-before=amdgpu-unify-divergent-exit-nodes -mattr=+flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn -mcpu=fiji -start-before=amdgpu-unify-divergent-exit-nodes < %s | FileCheck -enable-var-scope --check-prefixes=GCN,VI %s
; --------------------------------------------------------------------------------
; fadd tests
; --------------------------------------------------------------------------------
; Baseline case, fneg of an fadd with no fast-math flags on either operation.
; Each work-item loads a[tid] and b[tid] and stores -(a + b) to out[tid].
; The checks expect the negation to stay a separate v_xor_b32 of the sign bit
; after v_add_f32, i.e. the fneg is not folded into the add operands. Without
; nsz such a fold could change the sign of a zero result.
define amdgpu_kernel void @v_fneg_add_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_add_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v5, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_add_f32_e32 v2, v5, v2
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_add_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_f32_e32 v2, v5, v2
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; Per-work-item element index, sign-extended for 64-bit pointer arithmetic.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads, so both memory reads have to be emitted as written.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; Pattern under test, a plain fadd feeding a plain fneg.
%add = fadd float %a, %b
%fneg = fneg float %add
; Non-volatile store to the per-work-item output slot.
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; nsz variant of the fneg-of-fadd test. Both the fadd and the fneg carry nsz.
; Each work-item loads a[tid] and b[tid] and stores -(a + b) to out[tid].
; The checks expect a single v_sub_f32_e64 with a negated first operand,
; i.e. (-a) - b, and no separate v_xor_b32 of the sign bit.
define amdgpu_kernel void @v_fneg_add_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_add_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v5, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_sub_f32_e64 v2, -v5, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_add_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_f32_e64 v2, -v5, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; Per-work-item element index, sign-extended for 64-bit pointer arithmetic.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads, so both memory reads have to be emitted as written.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; Pattern under test, with the nsz flag present on the fadd and on the fneg.
%add = fadd nsz float %a, %b
%fneg = fneg nsz float %add
; Non-volatile store to the per-work-item output slot.
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg of an fadd (no fast-math flags) where the un-negated sum is also stored.
; The negated value and then the plain sum are written with volatile stores.
; The checks expect v_add_f32, a v_xor_b32 of the sign bit into a second
; register, and two stores, so the add result stays available for its own use.
define amdgpu_kernel void @v_fneg_add_store_use_add_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_add_store_use_add_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_f32_e32 v2, v4, v2
; SI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_add_store_use_add_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_f32_e32 v2, v4, v2
; VI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-work-item element index, sign-extended for 64-bit pointer arithmetic.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
; %out.gep is computed but never used below, the stores go to %out itself.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads, so both memory reads have to be emitted as written.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; Pattern under test, the fadd result has two users (the fneg and a store).
%add = fadd float %a, %b
%fneg = fneg float %add
; Volatile stores keep both values live and in this order.
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %add, ptr addrspace(1) %out
ret void
}
; fneg of an fadd (no fast-math flags) where the sum also feeds an fmul by 4.0.
; The negated sum and then the product are written with volatile stores.
; The checks expect v_add_f32, a v_xor_b32 of the sign bit into a second
; register, and a v_mul_f32 by 4.0 of the un-negated sum.
define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_add_multi_use_add_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_f32_e32 v2, v4, v2
; SI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; SI-NEXT: v_mul_f32_e32 v2, 4.0, v2
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_add_multi_use_add_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_f32_e32 v2, v4, v2
; VI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; VI-NEXT: v_mul_f32_e32 v2, 4.0, v2
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-work-item element index, sign-extended for 64-bit pointer arithmetic.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
; %out.gep is computed but never used below, the stores go to %out itself.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads, so both memory reads have to be emitted as written.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; Pattern under test, the fadd result has two users (the fneg and the fmul).
%add = fadd float %a, %b
%fneg = fneg float %add
%use1 = fmul float %add, 4.0
; Volatile stores keep both values live and in this order.
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %use1, ptr addrspace(1) %out
ret void
}
; nsz variant of the multi-use test. Only the fadd carries nsz here, the fneg
; and the fmul have no flags.
; The checks expect the sum to be produced already negated by a single
; v_sub_f32_e64 with a negated first operand, stored as the fneg result, and
; the fmul user to be expressed as a multiply of that negated sum by -4.0.
define amdgpu_kernel void @v_fneg_add_multi_use_add_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_add_multi_use_add_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_sub_f32_e64 v2, -v4, v2
; SI-NEXT: v_mul_f32_e32 v3, -4.0, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_add_multi_use_add_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_sub_f32_e64 v2, -v4, v2
; VI-NEXT: v_mul_f32_e32 v3, -4.0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-work-item element index, sign-extended for 64-bit pointer arithmetic.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
; %out.gep is computed but never used below, the stores go to %out itself.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads, so both memory reads have to be emitted as written.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; Pattern under test, nsz fadd with two users (the fneg and the fmul).
%add = fadd nsz float %a, %b
%fneg = fneg float %add
%use1 = fmul float %add, 4.0
; Volatile stores keep both values live and in this order.
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %use1, ptr addrspace(1) %out
ret void
}
; fneg of an fadd whose first operand is itself an fneg, no fast-math flags.
; Computes -((-a) + b) and writes it with a volatile store to %out.
; The checks expect the inner (-a) + b to become one v_sub_f32 (b - a) and the
; outer fneg to stay a separate v_xor_b32 of the sign bit.
define amdgpu_kernel void @v_fneg_add_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_add_fneg_x_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_f32_e32 v0, v1, v0
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_add_fneg_x_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_f32_e32 v0, v1, v0
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-work-item element index, sign-extended for 64-bit pointer arithmetic.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
; %out.gep is computed but never used below, the store goes to %out itself.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads, so both memory reads have to be emitted as written.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; Pattern under test, the left fadd operand is negated before the add.
%fneg.a = fneg float %a
%add = fadd float %fneg.a, %b
%fneg = fneg float %add
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; nsz variant of fneg((-a) + b). Only the fadd carries nsz.
; The result is written with a volatile store to %out.
; The checks expect the whole expression to become a single v_sub_f32
; computing a - b, with no v_xor_b32 of the sign bit.
define amdgpu_kernel void @v_fneg_add_fneg_x_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_add_fneg_x_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_f32_e32 v2, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_add_fneg_x_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_f32_e32 v2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-work-item element index, sign-extended for 64-bit pointer arithmetic.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
; %out.gep is computed but never used below, the store goes to %out itself.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads, so both memory reads have to be emitted as written.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; Pattern under test, negated left operand into an nsz fadd, then fneg.
%fneg.a = fneg float %a
%add = fadd nsz float %fneg.a, %b
%fneg = fneg float %add
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg of an fadd whose second operand is itself an fneg, no fast-math flags.
; Computes -(a + (-b)) and writes it with a volatile store to %out.
; The checks expect the inner a + (-b) to become one v_sub_f32 (a - b) and the
; outer fneg to stay a separate v_xor_b32 of the sign bit.
define amdgpu_kernel void @v_fneg_add_x_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_add_x_fneg_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_add_x_fneg_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-work-item element index, sign-extended for 64-bit pointer arithmetic.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
; %out.gep is computed but never used below, the store goes to %out itself.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads, so both memory reads have to be emitted as written.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; Pattern under test, the right fadd operand is negated before the add.
%fneg.b = fneg float %b
%add = fadd float %a, %fneg.b
%fneg = fneg float %add
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; nsz variant of fneg(a + (-b)). Only the fadd carries nsz.
; The result is written with a volatile store to %out.
; The checks expect the whole expression to become a single v_sub_f32
; computing b - a, with no v_xor_b32 of the sign bit.
define amdgpu_kernel void @v_fneg_add_x_fneg_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_add_x_fneg_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_f32_e32 v2, v1, v0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_add_x_fneg_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_f32_e32 v2, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-work-item element index, sign-extended for 64-bit pointer arithmetic.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
; %out.gep is computed but never used below, the store goes to %out itself.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads, so both memory reads have to be emitted as written.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; Pattern under test, negated right operand into an nsz fadd, then fneg.
%fneg.b = fneg float %b
%add = fadd nsz float %a, %fneg.b
%fneg = fneg float %add
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg of an fadd whose operands are both fneg results, no fast-math flags.
; Computes -((-a) + (-b)) and writes it with a volatile store to %out.
; The checks expect the inner sum to become one v_sub_f32_e64 with a negated
; first operand, i.e. (-a) - b, and the outer fneg to stay a separate
; v_xor_b32 of the sign bit.
define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_add_fneg_fneg_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_f32_e64 v0, -v0, v1
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_add_fneg_fneg_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_f32_e64 v0, -v0, v1
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-work-item element index, sign-extended for 64-bit pointer arithmetic.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
; %out.gep is computed but never used below, the store goes to %out itself.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads, so both memory reads have to be emitted as written.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; Pattern under test, both fadd operands are negated before the add.
%fneg.a = fneg float %a
%fneg.b = fneg float %b
%add = fadd float %fneg.a, %fneg.b
%fneg = fneg float %add
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; nsz variant of fneg((-a) + (-b)). Only the fadd carries nsz.
; The result is written with a volatile store to %out.
; The checks expect all three negations to cancel, leaving a single plain
; v_add_f32 of the two loaded values and no v_xor_b32 of the sign bit.
define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_add_fneg_fneg_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f32_e32 v2, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_add_fneg_fneg_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-work-item element index, sign-extended for 64-bit pointer arithmetic.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
; %out.gep is computed but never used below, the store goes to %out itself.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads, so both memory reads have to be emitted as written.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; Pattern under test, both operands negated into an nsz fadd, then fneg.
%fneg.a = fneg float %a
%fneg.b = fneg float %b
%add = fadd nsz float %fneg.a, %fneg.b
%fneg = fneg float %add
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg((-a) + b) with no fast-math flags, where the inner -a is also stored.
; The final result and then -a are written with volatile stores to %out.
; The checks expect a v_xor_b32 producing -a for the second store, a
; v_sub_f32 computing b - a for the inner add, and a further v_xor_b32 of the
; sign bit for the outer fneg.
define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_add_store_use_fneg_x_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_xor_b32_e32 v3, 0x80000000, v4
; SI-NEXT: v_sub_f32_e32 v2, v2, v4
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_add_store_use_fneg_x_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_xor_b32_e32 v3, 0x80000000, v4
; VI-NEXT: v_sub_f32_e32 v2, v2, v4
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-work-item element index, sign-extended for 64-bit pointer arithmetic.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
; %out.gep is computed but never used below, the stores go to %out itself.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads, so both memory reads have to be emitted as written.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; Pattern under test, %fneg.a has two users (the fadd and a store).
%fneg.a = fneg float %a
%add = fadd float %fneg.a, %b
%fneg = fneg float %add
; Volatile stores keep both values live and in this order.
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %fneg.a, ptr addrspace(1) %out
ret void
}
; nsz variant of fneg((-a) + b) where the inner -a is also stored. Only the
; fadd carries nsz.
; The final result and then -a are written with volatile stores to %out.
; The checks expect a v_xor_b32 producing -a for the second store and a single
; v_sub_f32 computing a - b for the final result, with no second v_xor_b32.
define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_add_store_use_fneg_x_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_xor_b32_e32 v3, 0x80000000, v4
; SI-NEXT: v_sub_f32_e32 v2, v4, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_add_store_use_fneg_x_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_xor_b32_e32 v3, 0x80000000, v4
; VI-NEXT: v_sub_f32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-work-item element index, sign-extended for 64-bit pointer arithmetic.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
; %out.gep is computed but never used below, the stores go to %out itself.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads, so both memory reads have to be emitted as written.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; Pattern under test, %fneg.a has two users (the nsz fadd and a store).
%fneg.a = fneg float %a
%add = fadd nsz float %fneg.a, %b
%fneg = fneg float %add
; Volatile stores keep both values live and in this order.
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %fneg.a, ptr addrspace(1) %out
ret void
}
; fneg (fadd (fneg a), b) without nsz, where (fneg a) also feeds an fmul by
; the scalar kernel argument %c.
; Expected ISA: the inner negation is folded into the multiply as a -v4 source
; modifier; the add is emitted as v_sub_f32 (b - a) followed by a v_xor_b32
; sign flip, i.e. the outer fneg is not folded into the add.
; Note: %out.gep is computed but unused; both volatile stores go to %out.
define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, float %c) #0 {
; SI-LABEL: v_fneg_add_multi_use_fneg_x_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_load_dword s2, s[4:5], 0xf
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mul_f32_e64 v3, -v4, s2
; SI-NEXT: v_sub_f32_e32 v2, v2, v4
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_add_multi_use_fneg_x_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_load_dword s2, s[4:5], 0x3c
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f32_e64 v3, -v4, s2
; VI-NEXT: v_sub_f32_e32 v2, v2, v4
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%fneg.a = fneg float %a
%add = fadd float %fneg.a, %b
%fneg = fneg float %add
%use1 = fmul float %fneg.a, %c
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %use1, ptr addrspace(1) %out
ret void
}
; fneg (fadd nsz (fneg a), b), where (fneg a) also feeds an fmul by the scalar
; kernel argument %c.
; Expected ISA: with nsz on the fadd, the outer negation is absorbed into a
; single v_sub_f32 computing a - b (no v_xor_b32 afterwards); the multiply
; still consumes -a through a -v4 source modifier.
; Note: %out.gep is computed but unused; both volatile stores go to %out.
define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, float %c) #0 {
; SI-LABEL: v_fneg_add_multi_use_fneg_x_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_load_dword s2, s[4:5], 0xf
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mul_f32_e64 v3, -v4, s2
; SI-NEXT: v_sub_f32_e32 v2, v4, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_add_multi_use_fneg_x_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_load_dword s2, s[4:5], 0x3c
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f32_e64 v3, -v4, s2
; VI-NEXT: v_sub_f32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%fneg.a = fneg float %a
%add = fadd nsz float %fneg.a, %b
%fneg = fneg float %add
%use1 = fmul float %fneg.a, %c
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %use1, ptr addrspace(1) %out
ret void
}
; This one asserted with -enable-no-signed-zeros-fp-math
; Shader computing fneg (fadd x, 0.0) with no nsz flag on the fadd; the
; negated value is one arm of a select that feeds a second compare/select.
; Expected ISA: the unflagged fdiv is expanded (v_div_scale / v_rcp / v_fma
; chain / v_div_fmas / v_div_fixup), the zero multiply and zero add appear as
; one v_mad_f32 v0, v0, 0, 0, and the fneg is applied as a -v0 source modifier
; on the first v_cndmask_b32.
; Note: the <4 x i32> %arg parameter is not referenced in the body.
define amdgpu_ps float @fneg_fadd_0_safe(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 {
; SI-LABEL: fneg_fadd_0_safe:
; SI: ; %bb.0: ; %.entry
; SI-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0
; SI-NEXT: v_rcp_f32_e32 v1, v0
; SI-NEXT: v_div_scale_f32 v2, vcc, 1.0, s1, 1.0
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v3, -v0, v1, 1.0
; SI-NEXT: v_fma_f32 v1, v3, v1, v1
; SI-NEXT: v_mul_f32_e32 v3, v2, v1
; SI-NEXT: v_fma_f32 v4, -v0, v3, v2
; SI-NEXT: v_fma_f32 v3, v4, v1, v3
; SI-NEXT: v_fma_f32 v0, -v0, v3, v2
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; SI-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0
; SI-NEXT: v_mad_f32 v0, v0, 0, 0
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: fneg_fadd_0_safe:
; VI: ; %bb.0: ; %.entry
; VI-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0
; VI-NEXT: v_div_scale_f32 v1, vcc, 1.0, s1, 1.0
; VI-NEXT: v_rcp_f32_e32 v2, v0
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v3, -v0, v2, 1.0
; VI-NEXT: v_fma_f32 v2, v3, v2, v2
; VI-NEXT: v_mul_f32_e32 v3, v1, v2
; VI-NEXT: v_fma_f32 v4, -v0, v3, v1
; VI-NEXT: v_fma_f32 v3, v4, v2, v3
; VI-NEXT: v_fma_f32 v0, -v0, v3, v1
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; VI-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0
; VI-NEXT: v_mad_f32 v0, v0, 0, 0
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
; VI-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc
; VI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
; VI-NEXT: ; return to shader part epilog
.entry:
%tmp7 = fdiv float 1.000000e+00, %tmp6
%tmp8 = fmul float 0.000000e+00, %tmp7
%tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
%.i188 = fadd float %tmp9, 0.000000e+00
%tmp10 = fcmp uge float %.i188, %tmp2
%tmp11 = fneg float %.i188
%.i092 = select i1 %tmp10, float %tmp2, float %tmp11
%tmp12 = fcmp ule float %.i092, 0.000000e+00
%.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
ret float %.i198
}
; Counterpart of the fneg (fadd x, 0.0) shader in which the fdiv carries afn
; and the fadd carries nsz; this function has no attribute group attached.
; Expected ISA: the division is a bare v_rcp_f32, only one v_mul_f32 by 0
; remains (no v_mad_f32 / add of zero), and the fneg is applied as a -v0
; source modifier on the first v_cndmask_b32.
; Note: the <4 x i32> %arg parameter is not referenced in the body.
define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr {
; GCN-LABEL: fneg_fadd_0_nsz:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: v_rcp_f32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mul_f32_e32 v0, 0, v0
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
; GCN-NEXT: ; return to shader part epilog
.entry:
%tmp7 = fdiv afn float 1.000000e+00, %tmp6
%tmp8 = fmul float 0.000000e+00, %tmp7
%tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
%.i188 = fadd nsz float %tmp9, 0.000000e+00
%tmp10 = fcmp uge float %.i188, %tmp2
%tmp11 = fneg float %.i188
%.i092 = select i1 %tmp10, float %tmp2, float %tmp11
%tmp12 = fcmp ule float %.i092, 0.000000e+00
%.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
ret float %.i198
}
; --------------------------------------------------------------------------------
; fmul tests
; --------------------------------------------------------------------------------
; Baseline fmul case: fneg (fmul a, b) with the product having no other use.
; Expected ISA: a single v_mul_f32_e64 with the negation folded onto one
; operand as a source modifier (v5, -v2); no separate sign-flip instruction.
; The result is stored (non-volatile) to the per-workitem slot %out.gep.
define amdgpu_kernel void @v_fneg_mul_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_mul_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v5, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e64 v2, v5, -v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_mul_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e64 v2, v5, -v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%mul = fmul float %a, %b
%fneg = fneg float %mul
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg (fmul a, b), where the un-negated product is additionally stored.
; Expected ISA: the negation is not folded into the multiply; a plain
; v_mul_f32 produces a*b, a v_xor_b32 of the sign bit produces the negated
; copy, and both values are stored (negated first, product second).
; Note: %out.gep is computed but unused; both volatile stores go to %out.
define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_mul_store_use_mul_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mul_f32_e32 v2, v4, v2
; SI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_mul_store_use_mul_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mul_f32_e32 v2, v4, v2
; VI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%mul = fmul float %a, %b
%fneg = fneg float %mul
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %mul, ptr addrspace(1) %out
ret void
}
; fneg (fmul a, b), where the product also feeds a second fmul by 4.0.
; Expected ISA: the first multiply is emitted already negated
; (v_mul_f32_e64 v2, v4, -v2), and the second use compensates by multiplying
; that negated value by -4.0 instead of 4.0, so no sign-flip instruction is
; needed.
; Note: %out.gep is computed but unused; both volatile stores go to %out.
define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_mul_multi_use_mul_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mul_f32_e64 v2, v4, -v2
; SI-NEXT: v_mul_f32_e32 v3, -4.0, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_mul_multi_use_mul_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mul_f32_e64 v2, v4, -v2
; VI-NEXT: v_mul_f32_e32 v3, -4.0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%mul = fmul float %a, %b
%fneg = fneg float %mul
%use1 = fmul float %mul, 4.0
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %use1, ptr addrspace(1) %out
ret void
}
; fneg (fmul (fneg a), b): the first operand and the result are both negated.
; Expected ISA: a plain v_mul_f32_e32 of the two loaded values with no source
; modifiers and no sign-flip instruction, i.e. the two negations cancel.
; Note: %out.gep is computed but unused; the volatile store goes to %out.
define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_mul_fneg_x_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_mul_fneg_x_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f32_e32 v2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%fneg.a = fneg float %a
%mul = fmul float %fneg.a, %b
%fneg = fneg float %mul
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg (fmul a, (fneg b)): the second operand and the result are both negated.
; Expected ISA: a plain v_mul_f32_e32 of the two loaded values with no source
; modifiers and no sign-flip instruction, i.e. the two negations cancel.
; Note: %out.gep is computed but unused; the volatile store goes to %out.
define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_mul_x_fneg_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_mul_x_fneg_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f32_e32 v2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%fneg.b = fneg float %b
%mul = fmul float %a, %fneg.b
%fneg = fneg float %mul
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg (fmul (fneg a), (fneg b)): both operands and the result are negated.
; Expected ISA: a single v_mul_f32_e64 with exactly one negated source
; (v0, -v1); the three negations reduce to one source modifier.
; Note: %out.gep is computed but unused; the volatile store goes to %out.
define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_mul_fneg_fneg_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e64 v2, v0, -v1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_mul_fneg_fneg_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f32_e64 v2, v0, -v1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%fneg.a = fneg float %a
%fneg.b = fneg float %b
%mul = fmul float %fneg.a, %fneg.b
%fneg = fneg float %mul
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg (fmul (fneg a), b), where (fneg a) is additionally stored.
; Expected ISA: the multiply is a plain v_mul_f32_e32 of a and b (the two
; negations cancel), while -a is materialized separately with a v_xor_b32 of
; the sign bit for the second store.
; Note: %out.gep is computed but unused; both volatile stores go to %out.
define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_mul_store_use_fneg_x_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_xor_b32_e32 v3, 0x80000000, v4
; SI-NEXT: v_mul_f32_e32 v2, v4, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_mul_store_use_fneg_x_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_xor_b32_e32 v3, 0x80000000, v4
; VI-NEXT: v_mul_f32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%fneg.a = fneg float %a
%mul = fmul float %fneg.a, %b
%fneg = fneg float %mul
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %fneg.a, ptr addrspace(1) %out
ret void
}
; fneg (fmul (fneg a), b), where (fneg a) also feeds an fmul by the scalar
; kernel argument %c.
; Expected ISA: the negated product is a plain v_mul_f32_e32 of a and b (the
; two negations cancel), and the extra use consumes -a through a -v4 source
; modifier on its own v_mul_f32_e64; no v_xor_b32 is emitted.
; Note: %out.gep is computed but unused; both volatile stores go to %out.
define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, float %c) #0 {
; SI-LABEL: v_fneg_mul_multi_use_fneg_x_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_load_dword s2, s[4:5], 0xf
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mul_f32_e64 v3, -v4, s2
; SI-NEXT: v_mul_f32_e32 v2, v4, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_mul_multi_use_fneg_x_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_load_dword s2, s[4:5], 0x3c
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f32_e64 v3, -v4, s2
; VI-NEXT: v_mul_f32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%fneg.a = fneg float %a
%mul = fmul float %fneg.a, %b
%fneg = fneg float %mul
%use1 = fmul float %fneg.a, %c
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %use1, ptr addrspace(1) %out
ret void
}
; --------------------------------------------------------------------------------
; fminnum tests
; --------------------------------------------------------------------------------
; fneg (llvm.minnum.f32 a, b) in an amdgpu_kernel function.
; Expected ISA: the negated minimum is emitted as a maximum of negated inputs;
; each loaded value is first multiplied by -1.0 with its own v_mul_f32, then
; the two results are combined by v_max_f32.
; NOTE(review): the separate -1.0 multiplies (rather than source modifiers on
; v_max_f32) are presumably required by IEEE-mode input quieting for kernels,
; as the _ieee suffix suggests - confirm against the backend.
; The result is stored (non-volatile) to the per-workitem slot %out.gep.
define amdgpu_kernel void @v_fneg_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_minnum_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v5, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v3, -1.0, v5
; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2
; SI-NEXT: v_max_f32_e32 v2, v3, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_minnum_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v3, -1.0, v5
; VI-NEXT: v_mul_f32_e32 v2, -1.0, v2
; VI-NEXT: v_max_f32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%min = call float @llvm.minnum.f32(float %a, float %b)
%fneg = fneg float %min
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; amdgpu_ps test: fneg(minnum(%a, %b)) on two float arguments, returned
; directly.
; The checks expect a single v_max_f32_e64 with both sources negated.
; NOTE(review): "_no_ieee" presumably reflects IEEE mode being off for the
; amdgpu_ps calling convention - confirm.
define amdgpu_ps float @v_fneg_minnum_f32_no_ieee(float %a, float %b) #0 {
; GCN-LABEL: v_fneg_minnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: v_max_f32_e64 v0, -v0, -v1
; GCN-NEXT: ; return to shader part epilog
%min = call float @llvm.minnum.f32(float %a, float %b)
%fneg = fneg float %min
ret float %fneg
}
; Kernel test: fneg(minnum(%a, %a)), i.e. minnum with the same volatile-loaded
; value as both operands.
; The checks expect one v_mul by -1.0 of the loaded value followed by
; v_max_f32 of that product with itself; no separate negate is emitted.
define amdgpu_kernel void @v_fneg_self_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_self_minnum_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; SI-NEXT: v_max_f32_e32 v2, v2, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_self_minnum_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; VI-NEXT: v_max_f32_e32 v2, v2, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%min = call float @llvm.minnum.f32(float %a, float %a)
%min.fneg = fneg float %min
store float %min.fneg, ptr addrspace(1) %out.gep
ret void
}
; amdgpu_ps test: fneg(minnum(%a, %a)) on a single float argument.
; The checks expect a single v_max_f32_e64 whose two sources are both the
; negated input register.
define amdgpu_ps float @v_fneg_self_minnum_f32_no_ieee(float %a) #0 {
; GCN-LABEL: v_fneg_self_minnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: v_max_f32_e64 v0, -v0, -v0
; GCN-NEXT: ; return to shader part epilog
%min = call float @llvm.minnum.f32(float %a, float %a)
%min.fneg = fneg float %min
ret float %min.fneg
}
; Kernel test: fneg(minnum(4.0, %a)) with a positive constant operand.
; The checks expect the loaded value to be multiplied by -1.0 and then
; v_max_f32 against -4.0, so the negation is absorbed by flipping the sign of
; the constant and turning the min into a max.
define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_posk_minnum_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; SI-NEXT: v_max_f32_e32 v2, -4.0, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_posk_minnum_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; VI-NEXT: v_max_f32_e32 v2, -4.0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%min = call float @llvm.minnum.f32(float 4.0, float %a)
%fneg = fneg float %min
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; amdgpu_ps test: fneg(minnum(4.0, %a)) on a float argument.
; The checks expect a single v_max_f32_e64 of the negated input and -4.0.
define amdgpu_ps float @v_fneg_posk_minnum_f32_no_ieee(float %a) #0 {
; GCN-LABEL: v_fneg_posk_minnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: v_max_f32_e64 v0, -v0, -4.0
; GCN-NEXT: ; return to shader part epilog
%min = call float @llvm.minnum.f32(float 4.0, float %a)
%fneg = fneg float %min
ret float %fneg
}
; Kernel test: fneg(minnum(-4.0, %a)) with a negative constant operand.
; The checks expect the loaded value to be multiplied by -1.0 and then
; v_max_f32 against 4.0 (the constant with its sign flipped).
define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_negk_minnum_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; SI-NEXT: v_max_f32_e32 v2, 4.0, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_negk_minnum_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; VI-NEXT: v_max_f32_e32 v2, 4.0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%min = call float @llvm.minnum.f32(float -4.0, float %a)
%fneg = fneg float %min
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; amdgpu_ps test: fneg(minnum(-4.0, %a)) on a float argument.
; The checks expect a single v_max_f32_e64 of the negated input and 4.0.
define amdgpu_ps float @v_fneg_negk_minnum_f32_no_ieee(float %a) #0 {
; GCN-LABEL: v_fneg_negk_minnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: v_max_f32_e64 v0, -v0, 4.0
; GCN-NEXT: ; return to shader part epilog
%min = call float @llvm.minnum.f32(float -4.0, float %a)
%fneg = fneg float %min
ret float %fneg
}
; Kernel test: fneg of an nnan minnum(0.0, %a).
; Here the checks expect the negation NOT to be folded: the min stays a
; v_min_f32 against 0 on the loaded value and is followed by an explicit
; sign-bit flip (v_xor with 0x80000000). No v_mul of the input is emitted.
; NOTE(review): presumably the nnan flag is what removes the input
; canonicalizing multiply seen in the neighbouring kernel tests - confirm.
define amdgpu_kernel void @v_fneg_0_minnum_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_0_minnum_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_min_f32_e32 v2, 0, v3
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_0_minnum_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_min_f32_e32 v2, 0, v3
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%min = call nnan float @llvm.minnum.f32(float 0.0, float %a)
%fneg = fneg float %min
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; Kernel test: fneg(minnum(-0.0, %a)).
; The checks expect the negation to be folded: the loaded value is multiplied
; by -1.0 and then combined with v_max_f32 against an operand printed as 0
; (the -0.0 constant with its sign flipped).
define amdgpu_kernel void @v_fneg_neg0_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_neg0_minnum_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; SI-NEXT: v_max_f32_e32 v2, 0, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_neg0_minnum_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; VI-NEXT: v_max_f32_e32 v2, 0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%min = call float @llvm.minnum.f32(float -0.0, float %a)
%fneg = fneg float %min
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; Kernel test: fneg(minnum(K, %a)) where K is the float constant
; 0x3FC45F3060000000 (printed as 0.15915494 in the VI checks).
; The two targets are expected to differ:
;  - SI checks fold the negation: v_mul by -1.0, then v_max_f32 against the
;    literal 0xbe22f983.
;  - VI checks keep the min: v_mul by 1.0, v_min_f32 against 0.15915494, then
;    an explicit sign-bit flip with v_xor 0x80000000.
; NOTE(review): presumably VI can encode this constant inline while its
; negation would need a literal, making the fold unprofitable there - confirm.
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_inv2pi_minnum_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; SI-NEXT: v_max_f32_e32 v2, 0xbe22f983, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_inv2pi_minnum_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v3
; VI-NEXT: v_min_f32_e32 v2, 0.15915494, v2
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
%fneg = fneg float %min
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; Kernel test: fneg(minnum(K, %a)) where K is the negative float constant
; 0xBFC45F3060000000.
; Both targets are expected to fold the negation: v_mul by -1.0 of the loaded
; value followed by v_max_f32 against the sign-flipped constant, printed as
; the literal 0x3e22f983 in the SI checks and as 0.15915494 in the VI checks.
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_neg_inv2pi_minnum_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; SI-NEXT: v_max_f32_e32 v2, 0x3e22f983, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_neg_inv2pi_minnum_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; VI-NEXT: v_max_f32_e32 v2, 0.15915494, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%min = call float @llvm.minnum.f32(float 0xBFC45F3060000000, float %a)
%fneg = fneg float %min
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; Kernel test, half precision: negation of minnum(0xH3118, %a), with the
; negation written as "fsub half -0.0, %min" rather than an fneg instruction.
; Expected output differs per target:
;  - SI checks do the work in f32: the loaded half is converted with a negated
;    source (v_cvt_f32_f16_e64 ... -v0), combined with v_max_f32 against the
;    literal 0xbe230000, and converted back to f16.
;  - VI checks stay in f16 and do not fold the negation: v_max_f16 of the
;    input with itself, v_min_f16 against 0.15915494, then v_xor with 0x8000.
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_inv2pi_minnum_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_ushort v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v3, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_store_short v[0:1], v3
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_inv2pi_minnum_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_max_f16_e32 v2, v3, v3
; VI-NEXT: v_min_f16_e32 v2, 0.15915494, v2
; VI-NEXT: v_xor_b32_e32 v2, 0x8000, v2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds half, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile half, ptr addrspace(1) %a.gep
%min = call half @llvm.minnum.f16(half 0xH3118, half %a)
%fneg = fsub half -0.000000e+00, %min
store half %fneg, ptr addrspace(1) %out.gep
ret void
}
; Kernel test, half precision: negation of minnum(0xHB118, %a) (the negative
; counterpart of 0xH3118), with the negation written as "fsub half -0.0, %min".
; Both targets are expected to fold the negation:
;  - SI checks convert the negated input to f32, use v_max_f32 against the
;    literal 0x3e230000, and convert back to f16.
;  - VI checks use v_max_f16_e64 of the negated input with itself, then
;    v_max_f16 against 0.15915494, with no trailing v_xor.
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_neg_inv2pi_minnum_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_ushort v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_max_f32_e32 v0, 0x3e230000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v3, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_store_short v[0:1], v3
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_neg_inv2pi_minnum_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_max_f16_e64 v2, -v3, -v3
; VI-NEXT: v_max_f16_e32 v2, 0.15915494, v2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds half, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile half, ptr addrspace(1) %a.gep
%min = call half @llvm.minnum.f16(half 0xHB118, half %a)
%fneg = fsub half -0.000000e+00, %min
store half %fneg, ptr addrspace(1) %out.gep
ret void
}
; Kernel test, double precision: negation of minnum(0x3fc45f306dc9c882, %a),
; with the negation written as "fsub double -0.0, %min".
; Expected output differs per target:
;  - SI checks fold the negation: the sign-flipped constant is materialized
;    in s[2:3] (high word 0xbfc45f30), the input goes through v_max_f64 with
;    both sources negated, and the result is v_max_f64 against s[2:3].
;  - VI checks keep the min: v_max_f64 of the input with itself, v_min_f64
;    against 0.15915494309189532, then v_xor 0x80000000 on the high dword.
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_inv2pi_minnum_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s2, 0x6dc9c882
; SI-NEXT: s_mov_b32 s3, 0xbfc45f30
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1]
; SI-NEXT: v_max_f64 v[0:1], v[0:1], s[2:3]
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_inv2pi_minnum_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; VI-NEXT: v_min_f64 v[0:1], v[0:1], 0.15915494309189532
; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile double, ptr addrspace(1) %a.gep
%min = call double @llvm.minnum.f64(double 0x3fc45f306dc9c882, double %a)
%fneg = fsub double -0.000000e+00, %min
store double %fneg, ptr addrspace(1) %out.gep
ret void
}
; Kernel test, double precision: negation of minnum(0xbfc45f306dc9c882, %a),
; with the negation written as "fsub double -0.0, %min".
; Both targets are expected to fold the negation into a max on the negated
; input (v_max_f64 with both sources negated, then a second v_max_f64):
;  - SI checks take the sign-flipped constant from s[2:3] (high word
;    0x3fc45f30).
;  - VI checks print the constant directly as 0.15915494309189532.
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_neg_inv2pi_minnum_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s2, 0x6dc9c882
; SI-NEXT: s_mov_b32 s3, 0x3fc45f30
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1]
; SI-NEXT: v_max_f64 v[0:1], v[0:1], s[2:3]
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_neg_inv2pi_minnum_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1]
; VI-NEXT: v_max_f64 v[0:1], v[0:1], 0.15915494309189532
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile double, ptr addrspace(1) %a.gep
%min = call double @llvm.minnum.f64(double 0xbfc45f306dc9c882, double %a)
%fneg = fsub double -0.000000e+00, %min
store double %fneg, ptr addrspace(1) %out.gep
ret void
}
; amdgpu_ps test: fneg(minnum(-0.0, %a)) on a float argument.
; The checks expect a single v_max_f32_e64 of the negated input and an
; operand printed as 0.
define amdgpu_ps float @v_fneg_neg0_minnum_f32_no_ieee(float %a) #0 {
; GCN-LABEL: v_fneg_neg0_minnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: v_max_f32_e64 v0, -v0, 0
; GCN-NEXT: ; return to shader part epilog
%min = call float @llvm.minnum.f32(float -0.0, float %a)
%fneg = fneg float %min
ret float %fneg
}
; Kernel test: fneg(minnum(0.0, %a)) whose only user is an fmul by %b.
; The checks expect the min to be left alone (v_mul by 1.0 of the loaded
; value, then v_min_f32 against 0) and the negation to be absorbed by the
; user as a source modifier: v_mul_f32_e64 with a negated first source.
define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_0_minnum_foldable_use_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4
; SI-NEXT: v_min_f32_e32 v2, 0, v2
; SI-NEXT: v_mul_f32_e64 v2, -v2, v3
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_0_minnum_foldable_use_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v4
; VI-NEXT: v_min_f32_e32 v2, 0, v2
; VI-NEXT: v_mul_f32_e64 v2, -v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%min = call float @llvm.minnum.f32(float 0.0, float %a)
%fneg = fneg float %min
%mul = fmul float %fneg, %b
store float %mul, ptr addrspace(1) %out.gep
ret void
}
; Kernel test: fneg(minnum(K, %a)) with K = 0x3FC45F3060000000, whose only
; user is an fmul by %b.
; Expected output differs per target:
;  - SI checks fold the negation into the min: v_mul by -1.0, v_max_f32
;    against the literal 0xbe22f983, then a plain v_mul_f32 by %b.
;  - VI checks keep the min (v_mul by 1.0, v_min_f32 against 0.15915494) and
;    fold the negation into the user as a negated source of v_mul_f32_e64.
define amdgpu_kernel void @v_fneg_inv2pi_minnum_foldable_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v2, -1.0, v4
; SI-NEXT: v_max_f32_e32 v2, 0xbe22f983, v2
; SI-NEXT: v_mul_f32_e32 v2, v2, v3
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v4
; VI-NEXT: v_min_f32_e32 v2, 0.15915494, v2
; VI-NEXT: v_mul_f32_e64 v2, -v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
%fneg = fneg float %min
%mul = fmul float %fneg, %b
store float %mul, ptr addrspace(1) %out.gep
ret void
}
; amdgpu_ps test: fneg(minnum(0.0, %a)) whose only user is an fmul by %b.
; The checks expect v_min_f32 against 0 on the raw input, followed by
; v_mul_f32_e64 with a negated first source; the negation is folded into the
; multiply rather than into the min.
define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
; GCN-LABEL: v_fneg_0_minnum_foldable_use_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: v_min_f32_e32 v0, 0, v0
; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1
; GCN-NEXT: ; return to shader part epilog
%min = call float @llvm.minnum.f32(float 0.0, float %a)
%fneg = fneg float %min
%mul = fmul float %fneg, %b
ret float %mul
}
; Kernel test: minnum(%a, %b) has two users, an fneg and an fmul by 4.0; both
; results are written with volatile stores to %out.
; The checks expect only the negated form to be computed (v_mul by -1.0 on
; each input, then v_max_f32); the second user is rewritten as a multiply of
; that negated value by -4.0.
; Note that %out.gep is computed in the IR but the stores use %out directly.
define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_minnum_multi_use_minnum_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mul_f32_e32 v3, -1.0, v4
; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2
; SI-NEXT: v_max_f32_e32 v2, v3, v2
; SI-NEXT: v_mul_f32_e32 v3, -4.0, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_minnum_multi_use_minnum_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mul_f32_e32 v3, -1.0, v4
; VI-NEXT: v_mul_f32_e32 v2, -1.0, v2
; VI-NEXT: v_max_f32_e32 v2, v3, v2
; VI-NEXT: v_mul_f32_e32 v3, -4.0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%min = call float @llvm.minnum.f32(float %a, float %b)
%fneg = fneg float %min
%use1 = fmul float %min, 4.0
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %use1, ptr addrspace(1) %out
ret void
}
; amdgpu_ps test: minnum(%a, %b) has two users, an fneg and an fmul by 4.0;
; the two results are returned as elements 0 and 1 of a <2 x float>.
; The checks expect a single v_max_f32_e64 with both sources negated for
; element 0, and element 1 computed as that negated value times -4.0.
define amdgpu_ps <2 x float> @v_fneg_minnum_multi_use_minnum_f32_no_ieee(float %a, float %b) #0 {
; GCN-LABEL: v_fneg_minnum_multi_use_minnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: v_max_f32_e64 v0, -v0, -v1
; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
; GCN-NEXT: ; return to shader part epilog
%min = call float @llvm.minnum.f32(float %a, float %b)
%fneg = fneg float %min
%use1 = fmul float %min, 4.0
%ins0 = insertelement <2 x float> poison, float %fneg, i32 0
%ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
ret <2 x float> %ins1
}
; --------------------------------------------------------------------------------
; fmaxnum tests
; --------------------------------------------------------------------------------
; fneg(maxnum(a, b)) in an amdgpu_kernel. Both operands are volatile loads from
; per-workitem addresses and the negated result is stored per-workitem.
; The expected output for both check prefixes multiplies each loaded value by
; -1.0 and then takes v_min_f32 of the two products; nothing negates the result
; afterwards.
; Note(review): the _ieee suffix presumably refers to the floating-point mode
; implied by the kernel calling convention - confirm against the RUN lines and
; attribute group #0, which are outside this view.
define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_maxnum_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v5, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v3, -1.0, v5
; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2
; SI-NEXT: v_min_f32_e32 v2, v3, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_maxnum_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v3, -1.0, v5
; VI-NEXT: v_mul_f32_e32 v2, -1.0, v2
; VI-NEXT: v_min_f32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; Index every buffer by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; Pattern under test: negate the result of maxnum.
%max = call float @llvm.maxnum.f32(float %a, float %b)
%fneg = fneg float %max
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg(maxnum(a, b)) in an amdgpu_ps function taking the operands directly as
; arguments. The expected output is a single v_min_f32_e64 with both operands
; negated, i.e. the fneg is absorbed entirely into the min/max instruction.
define amdgpu_ps float @v_fneg_maxnum_f32_no_ieee(float %a, float %b) #0 {
; GCN-LABEL: v_fneg_maxnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: v_min_f32_e64 v0, -v0, -v1
; GCN-NEXT: ; return to shader part epilog
%max = call float @llvm.maxnum.f32(float %a, float %b)
%fneg = fneg float %max
ret float %fneg
}
; fneg(maxnum(a, a)) in an amdgpu_kernel: both maxnum operands are the same
; volatile-loaded value. The expected output multiplies the loaded value by
; -1.0 once and then issues v_min_f32 with that product as both sources.
define amdgpu_kernel void @v_fneg_self_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_self_maxnum_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; SI-NEXT: v_min_f32_e32 v2, v2, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_self_maxnum_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; VI-NEXT: v_min_f32_e32 v2, v2, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
; Same value passed as both operands.
%max = call float @llvm.maxnum.f32(float %a, float %a)
%max.fneg = fneg float %max
store float %max.fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg(maxnum(a, a)) in an amdgpu_ps function. The expected output is a single
; v_min_f32_e64 whose two sources are both the negated argument.
define amdgpu_ps float @v_fneg_self_maxnum_f32_no_ieee(float %a) #0 {
; GCN-LABEL: v_fneg_self_maxnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: v_min_f32_e64 v0, -v0, -v0
; GCN-NEXT: ; return to shader part epilog
%max = call float @llvm.maxnum.f32(float %a, float %a)
%max.fneg = fneg float %max
ret float %max.fneg
}
; fneg(maxnum(4.0, a)) in an amdgpu_kernel, with a positive constant operand.
; The expected output multiplies the loaded value by -1.0 and then takes
; v_min_f32 against -4.0, so the constant is negated at compile time and the
; result needs no further sign flip.
define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_posk_maxnum_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; SI-NEXT: v_min_f32_e32 v2, -4.0, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_posk_maxnum_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; VI-NEXT: v_min_f32_e32 v2, -4.0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
; Positive constant as the first maxnum operand.
%max = call float @llvm.maxnum.f32(float 4.0, float %a)
%fneg = fneg float %max
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg(maxnum(4.0, a)) in an amdgpu_ps function. The expected output is a
; single v_min_f32_e64 of the negated argument and the constant -4.0.
define amdgpu_ps float @v_fneg_posk_maxnum_f32_no_ieee(float %a) #0 {
; GCN-LABEL: v_fneg_posk_maxnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: v_min_f32_e64 v0, -v0, -4.0
; GCN-NEXT: ; return to shader part epilog
%max = call float @llvm.maxnum.f32(float 4.0, float %a)
%fneg = fneg float %max
ret float %fneg
}
; fneg(maxnum(-4.0, a)) in an amdgpu_kernel, with a negative constant operand.
; The expected output multiplies the loaded value by -1.0 and then takes
; v_min_f32 against +4.0, mirroring the positive-constant variant with the
; constant's sign flipped.
define amdgpu_kernel void @v_fneg_negk_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_negk_maxnum_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; SI-NEXT: v_min_f32_e32 v2, 4.0, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_negk_maxnum_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; VI-NEXT: v_min_f32_e32 v2, 4.0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
; Negative constant as the first maxnum operand.
%max = call float @llvm.maxnum.f32(float -4.0, float %a)
%fneg = fneg float %max
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg(maxnum(-4.0, a)) in an amdgpu_ps function. The expected output is a
; single v_min_f32_e64 of the negated argument and the constant 4.0.
define amdgpu_ps float @v_fneg_negk_maxnum_f32_no_ieee(float %a) #0 {
; GCN-LABEL: v_fneg_negk_maxnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: v_min_f32_e64 v0, -v0, 4.0
; GCN-NEXT: ; return to shader part epilog
%max = call float @llvm.maxnum.f32(float -4.0, float %a)
%fneg = fneg float %max
ret float %fneg
}
; fneg(maxnum(+0.0, a)) in an amdgpu_kernel, where the maxnum call carries the
; nnan fast-math flag. Unlike the non-zero constant cases, the expected output
; keeps the operation as v_max_f32 against 0 and then flips the sign bit with
; an explicit v_xor_b32 of 0x80000000; the loaded value is not multiplied by
; -1.0 first.
; Note(review): presumably the fold is skipped because the negated constant
; would be -0.0 - confirm against the backend's fneg combine.
define amdgpu_kernel void @v_fneg_0_maxnum_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_0_maxnum_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_max_f32_e32 v2, 0, v3
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_0_maxnum_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_max_f32_e32 v2, 0, v3
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
; This is the only maxnum call in this group that carries nnan.
%max = call nnan float @llvm.maxnum.f32(float 0.0, float %a)
%fneg = fneg float %max
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg(maxnum(-0.0, a)) in an amdgpu_kernel. The expected output multiplies the
; loaded value by -1.0 and takes v_min_f32 against 0, i.e. the -0.0 constant is
; negated to +0.0 and the fneg disappears.
define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_neg0_maxnum_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; SI-NEXT: v_min_f32_e32 v2, 0, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_neg0_maxnum_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; VI-NEXT: v_min_f32_e32 v2, 0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
; Negative zero as the constant operand.
%max = call float @llvm.maxnum.f32(float -0.0, float %a)
%fneg = fneg float %max
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg(maxnum(-0.0, a)) in an amdgpu_ps function. The expected output is a
; single v_min_f32_e64 of the negated argument and 0.
define amdgpu_ps float @v_fneg_neg0_maxnum_f32_no_ieee(float %a) #0 {
; GCN-LABEL: v_fneg_neg0_maxnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: v_min_f32_e64 v0, -v0, 0
; GCN-NEXT: ; return to shader part epilog
%max = call float @llvm.maxnum.f32(float -0.0, float %a)
%fneg = fneg float %max
ret float %fneg
}
; fneg(maxnum(+0.0, a)) whose only user is an fmul, in an amdgpu_kernel.
; The expected output leaves the max itself un-negated (the loaded value is
; multiplied by 1.0, then v_max_f32 against 0) and instead applies the negation
; as a negated source operand on the final v_mul_f32_e64.
define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_0_maxnum_foldable_use_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4
; SI-NEXT: v_max_f32_e32 v2, 0, v2
; SI-NEXT: v_mul_f32_e64 v2, -v2, v3
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_0_maxnum_foldable_use_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v4
; VI-NEXT: v_max_f32_e32 v2, 0, v2
; VI-NEXT: v_mul_f32_e64 v2, -v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%max = call float @llvm.maxnum.f32(float 0.0, float %a)
%fneg = fneg float %max
; The fmul is the sole user of %fneg.
%mul = fmul float %fneg, %b
store float %mul, ptr addrspace(1) %out.gep
ret void
}
; fneg(maxnum(+0.0, a)) feeding an fmul, in an amdgpu_ps function. The expected
; output is v_max_f32 against 0 followed by a v_mul_f32_e64 that takes the max
; result as a negated source, so the fneg costs no extra instruction.
define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
; GCN-LABEL: v_fneg_0_maxnum_foldable_use_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: v_max_f32_e32 v0, 0, v0
; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1
; GCN-NEXT: ; return to shader part epilog
%max = call float @llvm.maxnum.f32(float 0.0, float %a)
%fneg = fneg float %max
%mul = fmul float %fneg, %b
ret float %mul
}
; Multi-use case for fneg(maxnum(a, b)) in an amdgpu_kernel: %max feeds both
; the fneg and an fmul by 4.0, and both results are stored with volatile
; stores. The expected output multiplies each loaded value by -1.0, takes
; v_min_f32 of the products (the negated max), and derives the second value by
; multiplying that negated result by -4.0.
define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_maxnum_multi_use_maxnum_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mul_f32_e32 v3, -1.0, v4
; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2
; SI-NEXT: v_min_f32_e32 v2, v3, v2
; SI-NEXT: v_mul_f32_e32 v3, -4.0, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_maxnum_multi_use_maxnum_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mul_f32_e32 v3, -1.0, v4
; VI-NEXT: v_mul_f32_e32 v2, -1.0, v2
; VI-NEXT: v_min_f32_e32 v2, v3, v2
; VI-NEXT: v_mul_f32_e32 v3, -4.0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
; Note that %out.gep is computed but never used; both stores below write to
; the base pointer %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%max = call float @llvm.maxnum.f32(float %a, float %b)
%fneg = fneg float %max
; Second, non-negated use of %max.
%use1 = fmul float %max, 4.0
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %use1, ptr addrspace(1) %out
ret void
}
; Multi-use case for fneg(maxnum(a, b)) in an amdgpu_ps function: %max feeds
; both the fneg and an fmul by 4.0, and both results are returned in a vector.
; The expected output is one v_min_f32_e64 of the negated inputs plus a
; multiply of that negated result by -4.0.
define amdgpu_ps <2 x float> @v_fneg_maxnum_multi_use_maxnum_f32_no_ieee(float %a, float %b) #0 {
; GCN-LABEL: v_fneg_maxnum_multi_use_maxnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: v_min_f32_e64 v0, -v0, -v1
; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
; GCN-NEXT: ; return to shader part epilog
%max = call float @llvm.maxnum.f32(float %a, float %b)
%fneg = fneg float %max
; Second, non-negated use of %max.
%use1 = fmul float %max, 4.0
%ins0 = insertelement <2 x float> poison, float %fneg, i32 0
%ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
ret <2 x float> %ins1
}
; --------------------------------------------------------------------------------
; fma tests
; --------------------------------------------------------------------------------
; fneg(fma(a, b, c)) in an amdgpu_kernel, with no fast-math flags on the fma
; call. The expected output keeps the v_fma_f32 with un-negated sources and
; then flips the sign bit of the result with v_xor_b32 of 0x80000000, i.e. the
; fneg is not folded into the fma operands here.
define amdgpu_kernel void @v_fneg_fma_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v6
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v6
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v6
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v7, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v6
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_fma_f32 v2, v7, v2, v3
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_fma_f32 v2, v7, v2, v3
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; Index every buffer by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; Pattern under test: negate the result of a flag-free fma.
%fma = call float @llvm.fma.f32(float %a, float %b, float %c)
%fneg = fneg float %fma
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg(fma(a, b, c)) in an amdgpu_kernel, where the fma call carries the nsz
; fast-math flag. The expected output is a single v_fma_f32 with the second and
; third sources negated and no trailing sign-bit xor, so with nsz the fneg is
; absorbed into the fma operands.
define amdgpu_kernel void @v_fneg_fma_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v6
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v6
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v6
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v7, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v6
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_fma_f32 v2, v7, -v2, -v3
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_fma_f32 v2, v7, -v2, -v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; The nsz flag is the only difference from the flag-free fma variant.
%fma = call nsz float @llvm.fma.f32(float %a, float %b, float %c)
%fneg = fneg float %fma
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg(fma(a, b, c)) in an amdgpu_kernel where the un-negated fma result is
; also stored. The fma call has no fast-math flags. The expected output
; computes the fma once, produces the negated copy in a second register with
; v_xor_b32 of 0x80000000, and stores the negated value followed by the
; original value.
define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_store_use_fma_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v6, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_fma_f32 v2, v6, v2, v3
; SI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_store_use_fma_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_fma_f32 v2, v6, v2, v3
; VI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
; Note that %out.gep is computed but never used; both stores below write to
; the base pointer %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
%fma = call float @llvm.fma.f32(float %a, float %b, float %c)
%fneg = fneg float %fma
store volatile float %fneg, ptr addrspace(1) %out
; The store is the second use of the un-negated %fma.
store volatile float %fma, ptr addrspace(1) %out
ret void
}
; Same shape as the store-use fma test, but the fma call carries nsz: the
; result is negated and the un-negated result is also stored. Even with nsz the
; expected output keeps the fma with un-negated sources and produces the
; negated copy with a separate v_xor_b32 of 0x80000000, then stores both.
define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_store_use_fma_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v6, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_fma_f32 v2, v6, v2, v3
; SI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_store_use_fma_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_fma_f32 v2, v6, v2, v3
; VI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
; Note that %out.gep is computed but never used; both stores below write to
; the base pointer %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
%fma = call nsz float @llvm.fma.f32(float %a, float %b, float %c)
%fneg = fneg float %fma
store volatile float %fneg, ptr addrspace(1) %out
; The store is the second use of the un-negated %fma.
store volatile float %fma, ptr addrspace(1) %out
ret void
}
; Multi-use case for fneg(fma(a, b, c)) in an amdgpu_kernel, with no fast-math
; flags: %fma feeds both the fneg and an fmul by 4.0, and both results are
; stored with volatile stores. The expected output computes the fma once,
; negates it with v_xor_b32 of 0x80000000 into a second register, and
; multiplies the original (un-negated) fma result by 4.0.
define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_multi_use_fma_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v6, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_fma_f32 v2, v6, v2, v3
; SI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; SI-NEXT: v_mul_f32_e32 v2, 4.0, v2
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_multi_use_fma_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_fma_f32 v2, v6, v2, v3
; VI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; VI-NEXT: v_mul_f32_e32 v2, 4.0, v2
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
; Note that %out.gep is computed but never used; both stores below write to
; the base pointer %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
%fma = call float @llvm.fma.f32(float %a, float %b, float %c)
%fneg = fneg float %fma
; Second, non-negated use of %fma.
%use1 = fmul float %fma, 4.0
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %use1, ptr addrspace(1) %out
ret void
}
; Same IR as v_fneg_fma_multi_use_fma_f32 except that the fma call carries the
; nsz flag.
; The assertions below expect the negation to be folded into the fma as source
; modifiers (v_fma_f32 v2, v6, -v2, -v3), so v2 already holds the negated
; value. The other use (fma * 4.0) is then expected as a v_mul_f32 of that
; negated value by -4.0, and no v_xor_b32 appears.
define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_multi_use_fma_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v6, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_fma_f32 v2, v6, -v2, -v3
; SI-NEXT: v_mul_f32_e32 v3, -4.0, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_multi_use_fma_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_fma_f32 v2, v6, -v2, -v3
; VI-NEXT: v_mul_f32_e32 v3, -4.0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-workitem addresses: each input pointer is indexed by the sign-extended
; workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
; Note: %out.gep is computed but never used; both stores below write to %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; nsz is the only difference from the non-nsz variant of this test.
%fma = call nsz float @llvm.fma.f32(float %a, float %b, float %c)
%fneg = fneg float %fma
; Second use of the un-negated fma result.
%use1 = fmul float %fma, 4.0
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %use1, ptr addrspace(1) %out
ret void
}
; fneg(fma(fneg(a), b, c)) with no fast-math flags on the fma.
; The assertions below expect the inner fneg as a source modifier on the first
; fma operand (v_fma_f32 v0, -v0, v1, v2) and the outer fneg as a separate
; v_xor_b32 with 0x80000000.
define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_fneg_x_y_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_fma_f32 v0, -v0, v1, v2
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_fneg_x_y_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_fma_f32 v0, -v0, v1, v2
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-workitem addresses: each input pointer is indexed by the sign-extended
; workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
; Note: %out.gep is computed but never used; the store below writes to %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; Pattern under test: the first multiplicand is negated, then the whole fma
; result is negated again.
%fneg.a = fneg float %a
%fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
%fneg = fneg float %fma
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg(fma(fneg(a), b, c)) with nsz on the fma call.
; The assertions below expect a single v_fma_f32 v2, v0, v1, -v2: the negation
; on the first multiplicand is gone, the addend carries a neg modifier, and no
; v_xor_b32 is emitted.
define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_fneg_x_y_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_fma_f32 v2, v0, v1, -v2
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_fneg_x_y_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_fma_f32 v2, v0, v1, -v2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-workitem addresses: each input pointer is indexed by the sign-extended
; workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
; Note: %out.gep is computed but never used; the store below writes to %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; Same pattern as the non-nsz variant; only the nsz flag on the call differs.
%fneg.a = fneg float %a
%fma = call nsz float @llvm.fma.f32(float %fneg.a, float %b, float %c)
%fneg = fneg float %fma
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg(fma(a, fneg(b), c)) with no fast-math flags on the fma.
; The assertions below expect the inner fneg as a source modifier on the second
; fma operand (v_fma_f32 v0, v0, -v1, v2) and the outer fneg as a separate
; v_xor_b32 with 0x80000000.
define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_x_fneg_y_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s5
; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_fma_f32 v0, v0, -v1, v2
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_x_fneg_y_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_fma_f32 v0, v0, -v1, v2
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-workitem addresses: each input pointer is indexed by the sign-extended
; workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
; %c.gep is formed before %b.gep in this test; the expected address
; computations above appear in the same order, so keep this order when editing.
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
; Note: %out.gep is computed but never used; the store below writes to %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; Pattern under test: the second multiplicand is negated, then the whole fma
; result is negated again.
%fneg.b = fneg float %b
%fma = call float @llvm.fma.f32(float %a, float %fneg.b, float %c)
%fneg = fneg float %fma
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg(fma(a, fneg(b), c)) with nsz on the fma call.
; The assertions below expect a single v_fma_f32 v2, v0, v1, -v2: the negation
; on the second multiplicand is gone, the addend carries a neg modifier, and
; no v_xor_b32 is emitted.
define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_x_fneg_y_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s5
; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_fma_f32 v2, v0, v1, -v2
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_x_fneg_y_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_fma_f32 v2, v0, v1, -v2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-workitem addresses: each input pointer is indexed by the sign-extended
; workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
; %c.gep is formed before %b.gep in this test; the expected address
; computations above appear in the same order, so keep this order when editing.
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
; Note: %out.gep is computed but never used; the store below writes to %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; Same pattern as the non-nsz variant; only the nsz flag on the call differs.
%fneg.b = fneg float %b
%fma = call nsz float @llvm.fma.f32(float %a, float %fneg.b, float %c)
%fneg = fneg float %fma
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg(fma(fneg(a), fneg(b), c)) with no fast-math flags on the fma.
; The assertions below expect a v_fma_f32 with no source modifiers at all
; (v_fma_f32 v0, v0, v1, v2), i.e. the two multiplicand negations do not show
; up in the output, followed by a separate v_xor_b32 with 0x80000000 for the
; outer fneg.
define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_fneg_fneg_y_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_fma_f32 v0, v0, v1, v2
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_fneg_fneg_y_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_fma_f32 v0, v0, v1, v2
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-workitem addresses: each input pointer is indexed by the sign-extended
; workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
; Note: %out.gep is computed but never used; the store below writes to %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; Pattern under test: both multiplicands are negated, then the whole fma
; result is negated again.
%fneg.a = fneg float %a
%fneg.b = fneg float %b
%fma = call float @llvm.fma.f32(float %fneg.a, float %fneg.b, float %c)
%fneg = fneg float %fma
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg(fma(fneg(a), fneg(b), c)) with nsz on the fma call.
; The assertions below expect a single v_fma_f32 v2, v0, -v1, -v2: one
; multiplicand and the addend carry neg modifiers, and no v_xor_b32 is emitted.
define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_fneg_fneg_y_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_fma_f32 v2, v0, -v1, -v2
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_fneg_fneg_y_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_fma_f32 v2, v0, -v1, -v2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-workitem addresses: each input pointer is indexed by the sign-extended
; workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
; Note: %out.gep is computed but never used; the store below writes to %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; Same pattern as the non-nsz variant; only the nsz flag on the call differs.
%fneg.a = fneg float %a
%fneg.b = fneg float %b
%fma = call nsz float @llvm.fma.f32(float %fneg.a, float %fneg.b, float %c)
%fneg = fneg float %fma
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg(fma(fneg(a), b, fneg(c))) with no fast-math flags on the fma.
; The assertions below expect both inner negations as source modifiers
; (v_fma_f32 v0, -v0, v1, -v2) and the outer fneg as a separate v_xor_b32
; with 0x80000000.
define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_fneg_x_fneg_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_fma_f32 v0, -v0, v1, -v2
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_fneg_x_fneg_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_fma_f32 v0, -v0, v1, -v2
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-workitem addresses: each input pointer is indexed by the sign-extended
; workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
; Note: %out.gep is computed but never used; the store below writes to %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; Pattern under test: the first multiplicand and the addend are negated, then
; the whole fma result is negated again.
%fneg.a = fneg float %a
%fneg.c = fneg float %c
%fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %fneg.c)
%fneg = fneg float %fma
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg(fma(fneg(a), b, fneg(c))) with nsz on the fma call.
; The assertions below expect a single v_fma_f32 v2, v0, v1, v2 with no source
; modifiers and no v_xor_b32: none of the three negations appears in the
; output.
define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_fneg_x_fneg_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_fma_f32 v2, v0, v1, v2
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_fneg_x_fneg_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_fma_f32 v2, v0, v1, v2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-workitem addresses: each input pointer is indexed by the sign-extended
; workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
; Note: %out.gep is computed but never used; the store below writes to %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; Same pattern as the non-nsz variant; only the nsz flag on the call differs.
%fneg.a = fneg float %a
%fneg.c = fneg float %c
%fma = call nsz float @llvm.fma.f32(float %fneg.a, float %b, float %fneg.c)
%fneg = fneg float %fma
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg(fma(a, b, fneg(c))) with no fast-math flags on the fma.
; The assertions below expect the inner fneg as a source modifier on the
; addend (v_fma_f32 v0, v0, v1, -v2) and the outer fneg as a separate
; v_xor_b32 with 0x80000000.
define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_x_y_fneg_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_fma_f32 v0, v0, v1, -v2
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_x_y_fneg_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_fma_f32 v0, v0, v1, -v2
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-workitem addresses: each input pointer is indexed by the sign-extended
; workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
; Note: %out.gep is computed but never used; the store below writes to %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; Pattern under test: only the addend is negated, then the whole fma result is
; negated again.
%fneg.c = fneg float %c
%fma = call float @llvm.fma.f32(float %a, float %b, float %fneg.c)
%fneg = fneg float %fma
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg(fma(a, b, fneg(c))) with nsz on the fma call.
; The assertions below expect a single v_fma_f32 v2, v0, -v1, v2: the negation
; on the addend is gone, one multiplicand carries a neg modifier, and no
; v_xor_b32 is emitted.
define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_x_y_fneg_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_fma_f32 v2, v0, -v1, v2
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_x_y_fneg_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_fma_f32 v2, v0, -v1, v2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Per-workitem addresses: each input pointer is indexed by the sign-extended
; workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
; Note: %out.gep is computed but never used; the store below writes to %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; Same pattern as the non-nsz variant; only the nsz flag on the call differs.
%fneg.c = fneg float %c
%fma = call nsz float @llvm.fma.f32(float %a, float %b, float %fneg.c)
%fneg = fneg float %fma
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg of an fma whose first operand is an fneg that is also stored on its own.
; The fma call carries no nsz flag. The expected code keeps the inner negate as
; a source modifier on the fma (-v6), applies the outer negate with a separate
; v_xor on the fma result, and materializes the stored fneg.a with its own v_xor.
define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_store_use_fneg_x_y_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v6, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_xor_b32_e32 v4, 0x80000000, v6
; SI-NEXT: v_fma_f32 v2, -v6, v2, v3
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_store_use_fneg_x_y_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_xor_b32_e32 v4, 0x80000000, v6
; VI-NEXT: v_fma_f32 v2, -v6, v2, v3
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
%fneg.a = fneg float %a
%fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
%fneg = fneg float %fma
; Both volatile stores target %out directly; %out.gep above is computed but unused.
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %fneg.a, ptr addrspace(1) %out
ret void
}
; nsz variant of the store-use test: the fma call carries the nsz flag.
; The expected code folds the outer negate into the fma operands, so the first
; operand is used un-negated (v6) and the addend is negated (-v3), with no v_xor
; on the fma result. The separately stored fneg.a is still produced by a v_xor.
define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fma_store_use_fneg_x_y_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v6, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_xor_b32_e32 v4, 0x80000000, v6
; SI-NEXT: v_fma_f32 v2, v6, v2, -v3
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_store_use_fneg_x_y_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_xor_b32_e32 v4, 0x80000000, v6
; VI-NEXT: v_fma_f32 v2, v6, v2, -v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
%fneg.a = fneg float %a
; nsz on this call is the only IR difference from the non-nsz variant of this test.
%fma = call nsz float @llvm.fma.f32(float %fneg.a, float %b, float %c)
%fneg = fneg float %fma
; Both volatile stores target %out directly; %out.gep above is computed but unused.
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %fneg.a, ptr addrspace(1) %out
ret void
}
; fneg of an fma whose negated first operand has a second arithmetic user
; (an fmul by the scalar kernel argument %d). The fma call carries no nsz flag.
; The expected code uses -v6 as a source modifier in both the fma and the
; multiply, and negates the fma result with a separate v_xor.
define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, float %d) #0 {
; SI-LABEL: v_fneg_fma_multi_use_fneg_x_y_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_load_dword s0, s[4:5], 0x11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s13
; SI-NEXT: v_add_i32_e32 v2, vcc, s12, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s15
; SI-NEXT: v_add_i32_e32 v4, vcc, s14, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v6, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_fma_f32 v2, -v6, v2, v3
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; SI-NEXT: v_mul_f32_e64 v3, -v6, s0
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fma_multi_use_fneg_x_y_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_load_dword s0, s[4:5], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s13
; VI-NEXT: v_add_u32_e32 v2, vcc, s12, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s15
; VI-NEXT: v_add_u32_e32 v4, vcc, s14, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_fma_f32 v2, -v6, v2, v3
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; VI-NEXT: v_mul_f32_e64 v3, -v6, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
%fneg.a = fneg float %a
%fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
%fneg = fneg float %fma
; Second user of %fneg.a, in addition to the fma above.
%use1 = fmul float %fneg.a, %d
; Both volatile stores target %out directly; %out.gep above is computed but unused.
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %use1, ptr addrspace(1) %out
ret void
}
; --------------------------------------------------------------------------------
; fmad tests
; --------------------------------------------------------------------------------
; fneg of an llvm.fmuladd.f32 result, with no nsz flag on the call.
; The expected code computes the multiply-add with v_mac_f32 and then flips the
; sign bit of the result with a separate v_xor; the negate is not folded.
define amdgpu_kernel void @v_fneg_fmad_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fmad_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v6
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v6
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v6
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v7, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v6
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mac_f32_e32 v3, v7, v2
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v3
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fmad_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mac_f32_e32 v3, v7, v2
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
%fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
%fneg = fneg float %fma
; Single non-volatile store of the negated result to the per-workitem slot.
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; nsz variant of v_fneg_fmad_f32: the llvm.fmuladd.f32 call carries nsz.
; The expected code is a single v_mad_f32 with the second multiplicand and the
; addend negated via source modifiers, and no v_xor on the result.
define amdgpu_kernel void @v_fneg_fmad_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fmad_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v6
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v6
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v6
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v7, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v6
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mad_f32 v2, v7, -v2, -v3
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fmad_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mad_f32 v2, v7, -v2, -v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; nsz on this call is the only IR difference from v_fneg_fmad_f32.
%fma = call nsz float @llvm.fmuladd.f32(float %a, float %b, float %c)
%fneg = fneg float %fma
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; <4 x float> version of v_fneg_fmad_f32: fneg of llvm.fmuladd.v4f32, no nsz.
; The expected code performs four scalar multiply-adds (three v_mad_f32 and one
; v_mac_f32) and then negates each lane with its own v_xor.
define amdgpu_kernel void @v_fneg_fmad_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fmad_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v12, 4, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v12
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v2, s5
; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v12
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc
; SI-NEXT: v_mov_b32_e32 v2, s7
; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v12
; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v13, s1
; SI-NEXT: v_add_i32_e32 v12, vcc, s0, v12
; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
; SI-NEXT: v_mad_f32 v0, v0, v4, v8
; SI-NEXT: v_mad_f32 v1, v1, v5, v9
; SI-NEXT: v_mad_f32 v2, v2, v6, v10
; SI-NEXT: v_mac_f32_e32 v11, v3, v7
; SI-NEXT: v_xor_b32_e32 v3, 0x80000000, v11
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; SI-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fmad_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v12, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v12
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v12
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v8, vcc, s6, v12
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v13, s1
; VI-NEXT: v_add_u32_e32 v12, vcc, s0, v12
; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
; VI-NEXT: v_mad_f32 v0, v0, v4, v8
; VI-NEXT: v_mad_f32 v1, v1, v5, v9
; VI-NEXT: v_mad_f32 v2, v2, v6, v10
; VI-NEXT: v_mac_f32_e32 v11, v3, v7
; VI-NEXT: v_xor_b32_e32 v3, 0x80000000, v11
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; VI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %c.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile <4 x float>, ptr addrspace(1) %a.gep
%b = load volatile <4 x float>, ptr addrspace(1) %b.gep
%c = load volatile <4 x float>, ptr addrspace(1) %c.gep
%fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
%fneg = fneg <4 x float> %fma
; Single non-volatile vector store of the negated result to the per-workitem slot.
store <4 x float> %fneg, ptr addrspace(1) %out.gep
ret void
}
; nsz variant of v_fneg_fmad_v4f32: the llvm.fmuladd.v4f32 call carries nsz.
; The expected code is four v_mad_f32 instructions, each with the second
; multiplicand and the addend negated via source modifiers, and no v_xor.
define amdgpu_kernel void @v_fneg_fmad_v4f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fmad_v4f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v12, 4, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v12
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v2, s5
; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v12
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc
; SI-NEXT: v_mov_b32_e32 v2, s7
; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v12
; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v13, s1
; SI-NEXT: v_add_i32_e32 v12, vcc, s0, v12
; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
; SI-NEXT: v_mad_f32 v3, v3, -v7, -v11
; SI-NEXT: v_mad_f32 v2, v2, -v6, -v10
; SI-NEXT: v_mad_f32 v1, v1, -v5, -v9
; SI-NEXT: v_mad_f32 v0, v0, -v4, -v8
; SI-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fmad_v4f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v12, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v12
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v12
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v8, vcc, s6, v12
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v13, s1
; VI-NEXT: v_add_u32_e32 v12, vcc, s0, v12
; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
; VI-NEXT: v_mad_f32 v3, v3, -v7, -v11
; VI-NEXT: v_mad_f32 v2, v2, -v6, -v10
; VI-NEXT: v_mad_f32 v1, v1, -v5, -v9
; VI-NEXT: v_mad_f32 v0, v0, -v4, -v8
; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %c.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile <4 x float>, ptr addrspace(1) %a.gep
%b = load volatile <4 x float>, ptr addrspace(1) %b.gep
%c = load volatile <4 x float>, ptr addrspace(1) %c.gep
; nsz on this call is the only IR difference from v_fneg_fmad_v4f32.
%fma = call nsz <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
%fneg = fneg <4 x float> %fma
store <4 x float> %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg of an llvm.fmuladd.f32 result that has a second user (fmul by 4.0).
; No nsz flag on the call. The expected code keeps the un-negated multiply-add
; (v_mac_f32), negates a copy with v_xor for the first store, and multiplies
; the un-negated value by 4.0 for the second store.
define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fmad_multi_use_fmad_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v6, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mac_f32_e32 v3, v6, v2
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v3
; SI-NEXT: v_mul_f32_e32 v3, 4.0, v3
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fmad_multi_use_fmad_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mac_f32_e32 v3, v6, v2
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v3
; VI-NEXT: v_mul_f32_e32 v3, 4.0, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
%fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
%fneg = fneg float %fma
; Second user of the un-negated %fma, in addition to the fneg above.
%use1 = fmul float %fma, 4.0
; Both volatile stores target %out directly; %out.gep above is computed but unused.
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %use1, ptr addrspace(1) %out
ret void
}
; nsz variant of v_fneg_fmad_multi_use_fmad_f32: the llvm.fmuladd.f32 result
; feeds both an fneg and an fmul by 4.0, and the fmuladd call carries nsz.
; With nsz the negate is expected to fold into the multiply-add (second
; multiplicand and addend negated via source modifiers), and the other user is
; rewritten in terms of the negated value, multiplying it by -4.0.
define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: v_fneg_fmad_multi_use_fmad_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v6, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mad_f32 v2, v6, -v2, -v3
; SI-NEXT: v_mul_f32_e32 v3, -4.0, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fmad_multi_use_fmad_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mad_f32 v2, v6, -v2, -v3
; VI-NEXT: v_mul_f32_e32 v3, -4.0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; nsz on this call is what distinguishes this test from the non-nsz variant.
%fma = call nsz float @llvm.fmuladd.f32(float %a, float %b, float %c)
%fneg = fneg float %fma
; Second user of the un-negated %fma, in addition to the fneg above.
%use1 = fmul float %fma, 4.0
; Both volatile stores target %out directly; %out.gep above is computed but unused.
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %use1, ptr addrspace(1) %out
ret void
}
; --------------------------------------------------------------------------------
; fp_extend tests
; --------------------------------------------------------------------------------
; Negation of an fpext from float to double, with the negation written as an
; fsub from -0.0. The expected code folds the negate into the conversion as a
; source modifier on the f32 input (v_cvt_f64_f32_e64 with -v1).
define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_fp_extend_f32_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v1
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dword v1, v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_cvt_f64_f32_e64 v[0:1], -v1
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fp_extend_f32_to_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v1, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_cvt_f64_f32_e64 v[0:1], -v1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%fpext = fpext float %a to double
; Negation is spelled as (-0.0 - x) here instead of the fneg instruction.
%fneg = fsub double -0.000000e+00, %fpext
store double %fneg, ptr addrspace(1) %out.gep
ret void
}
; Negation (written as an fsub from -0.0) of an fpext whose input is already an
; fneg. The expected code shows the two negations cancelling: a plain
; v_cvt_f64_f32_e32 of the loaded value with no source modifier and no v_xor.
define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_fp_extend_fneg_f32_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v1
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dword v1, v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fp_extend_fneg_f32_to_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v1, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
; Inner negate on the f32 source; its only user is the fpext below.
%fneg.a = fneg float %a
%fpext = fpext float %fneg.a to double
; Outer negate, spelled as (-0.0 - x) instead of the fneg instruction.
%fneg = fsub double -0.000000e+00, %fpext
store double %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg (fpext (fneg a)) for f32 -> f64 where the inner fneg is also stored.
; The expected assembly converts the unmodified loaded value with
; v_cvt_f64_f32 and separately materializes the negated f32 for the extra
; store by xor-ing the sign bit (0x80000000).
define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_fp_extend_store_use_fneg_f32_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v1
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dword v4, v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
; SI-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fp_extend_store_use_fneg_f32_to_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v4, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
; VI-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Index the input (float) and output (double) arrays by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%fneg.a = fneg float %a
%fpext = fpext float %fneg.a to double
; Outer negation, written as a subtraction from -0.0 rather than as fneg.
%fneg = fsub double -0.000000e+00, %fpext
store volatile double %fneg, ptr addrspace(1) %out.gep
; Second use of the inner fneg: a volatile store to a poison address.
store volatile float %fneg.a, ptr addrspace(1) poison
ret void
}
; fneg (fpext a) for f32 -> f64 where the fpext result is stored both negated
; and un-negated. The expected assembly performs one v_cvt_f64_f32 and builds
; the negated copy by xor-ing 0x80000000 into the upper register of the
; 64-bit result pair.
define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_multi_use_fp_extend_fneg_f32_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v1
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dword v1, v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; SI-NEXT: v_xor_b32_e32 v5, 0x80000000, v1
; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_multi_use_fp_extend_fneg_f32_to_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v1, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; VI-NEXT: v_xor_b32_e32 v5, 0x80000000, v1
; VI-NEXT: v_mov_b32_e32 v4, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Index the input (float) and output (double) arrays by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%fpext = fpext float %a to double
; Negation written as a subtraction from -0.0 rather than as fneg.
%fneg = fsub double -0.000000e+00, %fpext
store volatile double %fneg, ptr addrspace(1) %out.gep
; Second use of the fpext itself: a volatile store to a poison address.
store volatile double %fpext, ptr addrspace(1) poison
ret void
}
; fneg (fpext a) for f32 -> f64 where the fpext also feeds an fmul by 4.0.
; The expected assembly performs one v_cvt_f64_f32, multiplies the unmodified
; result with v_mul_f64, and produces the negated value by xor-ing 0x80000000
; into the upper register of the result pair.
define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v1
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dword v1, v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; SI-NEXT: v_xor_b32_e32 v5, 0x80000000, v1
; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v1, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v6, s1
; VI-NEXT: v_add_u32_e32 v5, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; VI-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; VI-NEXT: v_mul_f64 v[3:4], v[1:2], 4.0
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; VI-NEXT: flat_store_dwordx2 v[5:6], v[1:2]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[5:6], v[3:4]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Index the input (float) and output (double) arrays by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%fpext = fpext float %a to double
; Negation written as a subtraction from -0.0 rather than as fneg.
%fneg = fsub double -0.000000e+00, %fpext
; Second use of the fpext, by an arithmetic instruction instead of a store.
%mul = fmul double %fpext, 4.0
; Both results are stored volatile to the same output element.
store volatile double %fneg, ptr addrspace(1) %out.gep
store volatile double %mul, ptr addrspace(1) %out.gep
ret void
}
; FIXME: Source modifiers not folded for f16->f32
; fneg (fpext a) for f16 -> f32 where the fpext result is stored both negated
; and un-negated. In the expectations below, the SI output issues two
; v_cvt_f32_f16 conversions, one of them with a negated source operand, while
; the VI output converts once and flips the sign bit with v_xor_b32.
define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v1
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_ushort v1, v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_cvt_f32_f16_e64 v4, -v1
; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; SI-NEXT: flat_store_dword v[0:1], v4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_ushort v1, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_cvt_f32_f16_e32 v3, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Index the input (half) and output (float) arrays by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds half, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile half, ptr addrspace(1) %a.gep
%fpext = fpext half %a to float
%fneg = fneg float %fpext
; The negated and the plain extension are both stored volatile to the same
; output element, so the fpext has two uses.
store volatile float %fneg, ptr addrspace(1) %out.gep
store volatile float %fpext, ptr addrspace(1) %out.gep
ret void
}
; fneg (fpext a) for f16 -> f32 where the fpext also feeds an fmul by 4.0.
; In the expectations below, the SI output converts twice (once with a
; negated source operand) and multiplies the plain conversion, while the VI
; output converts once, flips the sign bit with v_xor_b32 for the negated
; value, and multiplies the plain conversion.
define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v1
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_ushort v1, v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
; SI-NEXT: v_cvt_f32_f16_e64 v4, -v1
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; SI-NEXT: v_mul_f32_e32 v2, 4.0, v3
; SI-NEXT: flat_store_dword v[0:1], v4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_ushort v1, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_cvt_f32_f16_e32 v3, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v3
; VI-NEXT: v_mul_f32_e32 v3, 4.0, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Index the input (half) and output (float) arrays by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds half, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile half, ptr addrspace(1) %a.gep
%fpext = fpext half %a to float
%fneg = fneg float %fpext
; Second use of the fpext, by an arithmetic instruction instead of a store.
%mul = fmul float %fpext, 4.0
store volatile float %fneg, ptr addrspace(1) %out.gep
store volatile float %mul, ptr addrspace(1) %out.gep
ret void
}
; --------------------------------------------------------------------------------
; fp_round tests
; --------------------------------------------------------------------------------
; fneg (fptrunc a) for f64 -> f32 with a single use of the truncation.
; The expected assembly carries the negation as a source modifier on the
; conversion (v_cvt_f32_f64_e64 with a negated operand); no separate xor is
; expected.
define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_fp_round_f64_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v1
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_cvt_f32_f64_e64 v2, -v[1:2]
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fp_round_f64_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_cvt_f32_f64_e64 v2, -v[1:2]
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; Index the input (double) and output (float) arrays by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile double, ptr addrspace(1) %a.gep
%fpround = fptrunc double %a to float
; Negation applied to the narrowed result.
%fneg = fneg float %fpround
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg (fptrunc (fneg a)) for f64 -> f32, where the inner negation has no
; other use. The expected assembly is a plain v_cvt_f32_f64 with no source
; modifier and no sign-bit xor, i.e. the two negations cancel.
define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_fp_round_fneg_f64_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v1
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_cvt_f32_f64_e32 v2, v[1:2]
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fp_round_fneg_f64_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_cvt_f32_f64_e32 v2, v[1:2]
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; Index the input (double) and output (float) arrays by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile double, ptr addrspace(1) %a.gep
; Inner negation, written as an nsz subtraction from -0.0.
%fneg.a = fsub nsz double -0.000000e+00, %a
%fpround = fptrunc double %fneg.a to float
%fneg = fneg float %fpround
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg (fptrunc (fneg a)) for f64 -> f32 where the inner negation is also
; stored. The expected assembly converts the unmodified loaded value with
; v_cvt_f32_f64 and materializes the negated f64 for the extra store by
; xor-ing 0x80000000 into the upper register of the loaded pair.
define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_fp_round_store_use_fneg_f64_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v1
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v4, s1
; SI-NEXT: v_add_i32_e32 v3, vcc, s0, v0
; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; SI-NEXT: v_cvt_f32_f64_e32 v5, v[1:2]
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; SI-NEXT: flat_store_dword v[3:4], v5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dwordx2 v[0:1], v[1:2]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fp_round_store_use_fneg_f64_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT: v_cvt_f32_f64_e32 v5, v[1:2]
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; VI-NEXT: flat_store_dword v[3:4], v5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[0:1], v[1:2]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Index the input (double) and output (float) arrays by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile double, ptr addrspace(1) %a.gep
; Inner negation, written as a subtraction from -0.0 (no fast-math flags).
%fneg.a = fsub double -0.000000e+00, %a
%fpround = fptrunc double %fneg.a to float
%fneg = fneg float %fpround
store volatile float %fneg, ptr addrspace(1) %out.gep
; Second use of the inner negation: a volatile store to a poison address.
store volatile double %fneg.a, ptr addrspace(1) poison
ret void
}
; fneg (fptrunc (fneg a)) for f64 -> f32 where the inner negation also feeds
; an fmul with the kernel argument %c. The expected assembly converts the
; unmodified loaded value with v_cvt_f32_f64 and carries the inner negation
; as a source modifier on v_mul_f64.
define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, double %c) #0 {
; SI-LABEL: v_fneg_fp_round_multi_use_fneg_f64_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v1
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v4, s1
; SI-NEXT: v_add_i32_e32 v3, vcc, s0, v0
; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; SI-NEXT: v_cvt_f32_f64_e32 v5, v[1:2]
; SI-NEXT: v_mul_f64 v[0:1], -v[1:2], s[4:5]
; SI-NEXT: flat_store_dword v[3:4], v5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fp_round_multi_use_fneg_f64_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_mul_f64 v[3:4], -v[1:2], s[4:5]
; VI-NEXT: v_cvt_f32_f64_e32 v2, v[1:2]
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[0:1], v[3:4]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Index the input (double) and output (float) arrays by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile double, ptr addrspace(1) %a.gep
; Inner negation, written as a subtraction from -0.0 (no fast-math flags).
%fneg.a = fsub double -0.000000e+00, %a
%fpround = fptrunc double %fneg.a to float
%fneg = fneg float %fpround
; Second use of the inner negation, by an arithmetic instruction.
%use1 = fmul double %fneg.a, %c
store volatile float %fneg, ptr addrspace(1) %out.gep
store volatile double %use1, ptr addrspace(1) poison
ret void
}
; fneg (fptrunc a) for f32 -> f16 with a single use of the truncation.
; The expected assembly carries the negation as a source modifier on the
; conversion (v_cvt_f16_f32_e64 with a negated operand).
define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_fp_round_f32_to_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v1
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dword v1, v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_cvt_f16_f32_e64 v3, -v1
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; SI-NEXT: flat_store_short v[0:1], v3
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fp_round_f32_to_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v1, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_cvt_f16_f32_e64 v3, -v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; VI-NEXT: flat_store_short v[0:1], v3
; VI-NEXT: s_endpgm
; Index the input (float) and output (half) arrays by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%fpround = fptrunc float %a to half
; Negation written as a subtraction from -0.0 rather than as fneg.
%fneg = fsub half -0.000000e+00, %fpround
store half %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg (fptrunc (fneg a)) for f32 -> f16, where the inner fneg has no other
; use. The expected assembly is a plain v_cvt_f16_f32 with no source modifier
; and no sign-bit xor, i.e. the two negations cancel.
define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_fp_round_fneg_f32_to_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v1
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dword v1, v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v3, v1
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; SI-NEXT: flat_store_short v[0:1], v3
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fp_round_fneg_f32_to_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v1, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_cvt_f16_f32_e32 v3, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; VI-NEXT: flat_store_short v[0:1], v3
; VI-NEXT: s_endpgm
; Index the input (float) and output (half) arrays by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
; Inner negation on the wide type, feeding only the truncation.
%fneg.a = fneg float %a
%fpround = fptrunc float %fneg.a to half
; Outer negation, written as a subtraction from -0.0 rather than as fneg.
%fneg = fsub half -0.000000e+00, %fpround
store half %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg (fptrunc a) for f64 -> f32 where the truncation is stored both negated
; and un-negated. The expected assembly performs one unmodified v_cvt_f32_f64
; and derives the negated value by xor-ing the sign bit (0x80000000) of the
; converted result.
define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v1
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_cvt_f32_f64_e32 v2, v[1:2]
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; SI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_cvt_f32_f64_e32 v2, v[1:2]
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Index the input (double) and output (float) arrays by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile double, ptr addrspace(1) %a.gep
%fpround = fptrunc double %a to float
%fneg = fneg float %fpround
; The negated and the plain truncation are both stored volatile to the same
; output element, so the fptrunc has two uses.
store volatile float %fneg, ptr addrspace(1) %out.gep
store volatile float %fpround, ptr addrspace(1) %out.gep
ret void
}
; fneg (fptrunc (fneg a)) for f32 -> f16 where the inner fneg is also stored.
; The expected assembly converts the unmodified loaded value with
; v_cvt_f16_f32 and materializes the negated f32 for the extra store by
; xor-ing the sign bit (0x80000000).
define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_fp_round_store_use_fneg_f32_to_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v1
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dword v2, v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v3, v2
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; SI-NEXT: flat_store_short v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fp_round_store_use_fneg_f32_to_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v2, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cvt_f16_f32_e32 v3, v2
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; VI-NEXT: flat_store_short v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Index the input (float) and output (half) arrays by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%fneg.a = fneg float %a
%fpround = fptrunc float %fneg.a to half
; Outer negation, written as a subtraction from -0.0 rather than as fneg.
%fneg = fsub half -0.000000e+00, %fpround
store volatile half %fneg, ptr addrspace(1) %out.gep
; Second use of the inner fneg: a volatile store to a poison address.
store volatile float %fneg.a, ptr addrspace(1) poison
ret void
}
; fneg (fptrunc (fneg a)) for f32 -> f16 where the inner fneg also feeds an
; fmul with the kernel argument %c. The expected assembly converts the
; unmodified loaded value with v_cvt_f16_f32 and carries the inner negation
; as a source modifier on v_mul_f32.
define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, float %c) #0 {
; SI-LABEL: v_fneg_fp_round_multi_use_fneg_f32_to_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dword s4, s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v1
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dword v2, v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v3, v2
; SI-NEXT: v_mul_f32_e64 v2, -v2, s4
; SI-NEXT: flat_store_short v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_fp_round_multi_use_fneg_f32_to_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dword s4, s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v2, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cvt_f16_f32_e32 v3, v2
; VI-NEXT: v_mul_f32_e64 v2, -v2, s4
; VI-NEXT: flat_store_short v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Index the input (float) and output (half) arrays by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%fneg.a = fneg float %a
%fpround = fptrunc float %fneg.a to half
; Outer negation, written as a subtraction from -0.0 rather than as fneg.
%fneg = fsub half -0.000000e+00, %fpround
; Second use of the inner fneg, by an arithmetic instruction.
%use1 = fmul float %fneg.a, %c
store volatile half %fneg, ptr addrspace(1) %out.gep
store volatile float %use1, ptr addrspace(1) poison
ret void
}
; --------------------------------------------------------------------------------
; rcp tests
; --------------------------------------------------------------------------------
; fneg (llvm.amdgcn.rcp.f32 a) with a single use of the rcp result.
; The expected assembly carries the negation as a source modifier on the
; reciprocal (v_rcp_f32_e64 with a negated operand).
define amdgpu_kernel void @v_fneg_rcp_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_rcp_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_rcp_f32_e64 v3, -v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_rcp_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_rcp_f32_e64 v3, -v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_endpgm
; Index the input and output float arrays by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%rcp = call float @llvm.amdgcn.rcp.f32(float %a)
; Negation applied to the intrinsic's result.
%fneg = fneg float %rcp
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg (llvm.amdgcn.rcp.f32 (fneg a)), where the inner fneg has no other use.
; The expected assembly is a plain v_rcp_f32 with no source modifier and no
; sign-bit xor, i.e. the two negations cancel.
define amdgpu_kernel void @v_fneg_rcp_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_rcp_fneg_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_rcp_f32_e32 v3, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_rcp_fneg_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_rcp_f32_e32 v3, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_endpgm
; Index the input and output float arrays by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
; Inner negation, feeding only the reciprocal intrinsic.
%fneg.a = fneg float %a
%rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
; Outer negation applied to the intrinsic's result.
%fneg = fneg float %rcp
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg(rcp(fneg(a))) where the inner negated input has a second, non-foldable
; use: it is also written by a volatile store (to a poison pointer).
; Expected output for both check prefixes: v_rcp_f32_e32 still operates on the
; un-negated load, and the stored negated input is materialized separately
; with v_xor_b32 against the sign-bit mask 0x80000000.
define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_rcp_store_use_fneg_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_rcp_f32_e32 v4, v3
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v3
; SI-NEXT: flat_store_dword v[0:1], v4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_rcp_store_use_fneg_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_rcp_f32_e32 v4, v3
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v3
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%fneg.a = fneg float %a
%rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
%fneg = fneg float %rcp
store volatile float %fneg, ptr addrspace(1) %out.gep
store volatile float %fneg.a, ptr addrspace(1) poison
ret void
}
; fneg(rcp(fneg(a))) where the inner negated input has a second use that can
; absorb a source modifier: an fmul with the kernel argument %c.
; Expected output for both check prefixes: v_rcp_f32_e32 on the un-negated
; load, and the negation shows up only as a neg modifier on the multiply
; (v_mul_f32_e64 v2, -v3, s4); no standalone negate instruction is emitted.
define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, float %c) #0 {
; SI-LABEL: v_fneg_rcp_multi_use_fneg_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dword s4, s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_rcp_f32_e32 v4, v3
; SI-NEXT: v_mul_f32_e64 v2, -v3, s4
; SI-NEXT: flat_store_dword v[0:1], v4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_rcp_multi_use_fneg_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dword s4, s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_rcp_f32_e32 v4, v3
; VI-NEXT: v_mul_f32_e64 v2, -v3, s4
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%fneg.a = fneg float %a
%rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
%fneg = fneg float %rcp
%use1 = fmul float %fneg.a, %c
store volatile float %fneg, ptr addrspace(1) %out.gep
store volatile float %use1, ptr addrspace(1) poison
ret void
}
; --------------------------------------------------------------------------------
; fmul_legacy tests
; --------------------------------------------------------------------------------
; fneg of llvm.amdgcn.fmul.legacy(a, b) with two volatile-loaded operands.
; Expected output for both check prefixes: a single v_mul_legacy_f32_e64 whose
; second source carries a neg modifier (v5, -v2); there is no separate negate
; of the product.
define amdgpu_kernel void @v_fneg_mul_legacy_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_mul_legacy_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v5, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_legacy_f32_e64 v2, v5, -v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_mul_legacy_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_legacy_f32_e64 v2, v5, -v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
%fneg = fneg float %mul
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg of llvm.amdgcn.fmul.legacy(a, b) where the un-negated product is also
; written by a volatile store.
; Expected output for both check prefixes: the multiply is emitted without
; modifiers (v_mul_legacy_f32_e32) and the negated copy is produced by
; v_xor_b32 with 0x80000000.
; Note that %out.gep is computed but not used; both stores target %out itself.
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_mul_legacy_store_use_mul_legacy_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mul_legacy_f32_e32 v2, v4, v2
; SI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_mul_legacy_store_use_mul_legacy_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mul_legacy_f32_e32 v2, v4, v2
; VI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
%fneg = fneg float %mul
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %mul, ptr addrspace(1) %out
ret void
}
; fneg of llvm.amdgcn.fmul.legacy(a, b) where the un-negated product also
; feeds a second fmul.legacy by the constant 4.0.
; Expected output for both check prefixes: the first multiply directly yields
; the negated product (v_mul_legacy_f32_e64 v2, v4, -v2), and the second
; multiply re-negates that value through a source modifier (-v2, 4.0), so no
; standalone negate instruction appears.
; Note that %out.gep is computed but not used; both stores target %out itself.
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_mul_legacy_multi_use_mul_legacy_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mul_legacy_f32_e64 v2, v4, -v2
; SI-NEXT: v_mul_legacy_f32_e64 v3, -v2, 4.0
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_mul_legacy_multi_use_mul_legacy_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mul_legacy_f32_e64 v2, v4, -v2
; VI-NEXT: v_mul_legacy_f32_e64 v3, -v2, 4.0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
%fneg = fneg float %mul
%use1 = call float @llvm.amdgcn.fmul.legacy(float %mul, float 4.0)
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %use1, ptr addrspace(1) %out
ret void
}
; fneg(fmul.legacy(fneg(a), b)): the first multiply operand and the product
; are both negated.
; Expected output for both check prefixes: a plain v_mul_legacy_f32_e32 on the
; two loaded values with no neg modifiers, i.e. the negations cancel.
; Note that %out.gep is computed but not used; the store targets %out itself.
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_mul_legacy_fneg_x_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_legacy_f32_e32 v2, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_mul_legacy_fneg_x_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_legacy_f32_e32 v2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%fneg.a = fneg float %a
%mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
%fneg = fneg float %mul
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg(fmul.legacy(a, fneg(b))): the second multiply operand and the product
; are both negated (mirror of the fneg_x variant, with the negated operand in
; the other position).
; Expected output for both check prefixes: a plain v_mul_legacy_f32_e32 on the
; two loaded values with no neg modifiers.
; Note that %out.gep is computed but not used; the store targets %out itself.
define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_mul_legacy_x_fneg_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_legacy_f32_e32 v2, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_mul_legacy_x_fneg_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_legacy_f32_e32 v2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%fneg.b = fneg float %b
%mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %fneg.b)
%fneg = fneg float %mul
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg(fmul.legacy(fneg(a), fneg(b))): both multiply operands and the product
; are negated, leaving an odd number of negations overall.
; Expected output for both check prefixes: a single v_mul_legacy_f32_e64 with
; exactly one neg source modifier (v0, -v1) and no standalone negate.
; Note that %out.gep is computed but not used; the store targets %out itself.
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_mul_legacy_fneg_fneg_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v1, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_legacy_f32_e64 v2, v0, -v1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_mul_legacy_fneg_fneg_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_legacy_f32_e64 v2, v0, -v1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%fneg.a = fneg float %a
%fneg.b = fneg float %b
%mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %fneg.b)
%fneg = fneg float %mul
store volatile float %fneg, ptr addrspace(1) %out
ret void
}
; fneg(fmul.legacy(fneg(a), b)) where the negated operand fneg(a) is also
; written by a volatile store.
; Expected output for both check prefixes: the multiply uses the un-negated
; loads (v_mul_legacy_f32_e32 v2, v4, v2), and the stored fneg(a) is produced
; separately by v_xor_b32 with 0x80000000.
; Note that %out.gep is computed but not used; both stores target %out itself.
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_mul_legacy_store_use_fneg_x_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_xor_b32_e32 v3, 0x80000000, v4
; SI-NEXT: v_mul_legacy_f32_e32 v2, v4, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_mul_legacy_store_use_fneg_x_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_xor_b32_e32 v3, 0x80000000, v4
; VI-NEXT: v_mul_legacy_f32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%fneg.a = fneg float %a
%mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
%fneg = fneg float %mul
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %fneg.a, ptr addrspace(1) %out
ret void
}
; fneg(fmul.legacy(fneg(a), b)) where the negated operand fneg(a) also feeds a
; second fmul.legacy with the kernel argument %c.
; Expected output for both check prefixes: the first multiply uses the
; un-negated loads (v_mul_legacy_f32_e32 v2, v4, v2), while the second carries
; the negation as a source modifier (v_mul_legacy_f32_e64 v3, -v4, s2); no
; standalone negate instruction is emitted.
; Note that %out.gep is computed but not used; both stores target %out itself.
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, float %c) #0 {
; SI-LABEL: v_fneg_mul_legacy_multi_use_fneg_x_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v4, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_load_dword s2, s[4:5], 0xf
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mul_legacy_f32_e64 v3, -v4, s2
; SI-NEXT: v_mul_legacy_f32_e32 v2, v4, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_mul_legacy_multi_use_fneg_x_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_load_dword s2, s[4:5], 0x3c
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_legacy_f32_e64 v3, -v4, s2
; VI-NEXT: v_mul_legacy_f32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%fneg.a = fneg float %a
%mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
%fneg = fneg float %mul
%use1 = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %c)
store volatile float %fneg, ptr addrspace(1) %out
store volatile float %use1, ptr addrspace(1) %out
ret void
}
; --------------------------------------------------------------------------------
; sin tests
; --------------------------------------------------------------------------------
; fneg of the generic llvm.sin.f32 intrinsic on a volatile-loaded value.
; Expected output for both check prefixes: the sequence v_mul_f32 by the
; literal 0xbe22f983, v_fract_f32, v_sin_f32, with no separate negate
; instruction and no neg source modifier on v_sin_f32.
; NOTE(review): the literal has its sign bit set and is presumably -1/(2*pi),
; i.e. the negation appears to be absorbed into the pre-scale constant;
; confirm against the sin lowering in the backend.
define amdgpu_kernel void @v_fneg_sin_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_sin_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mul_f32_e32 v0, 0xbe22f983, v0
; SI-NEXT: v_fract_f32_e32 v0, v0
; SI-NEXT: v_sin_f32_e32 v3, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_sin_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mul_f32_e32 v0, 0xbe22f983, v0
; VI-NEXT: v_fract_f32_e32 v0, v0
; VI-NEXT: v_sin_f32_e32 v3, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%sin = call float @llvm.sin.f32(float %a)
%fneg = fneg float %sin
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; fneg of the target-specific llvm.amdgcn.sin.f32 intrinsic (as opposed to the
; generic llvm.sin.f32) on a volatile-loaded value.
; Expected output for both check prefixes: v_sin_f32_e64 reads its source with
; a neg modifier (-v0); there is no pre-scale multiply, fract, or separate
; negate instruction.
define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_amdgcn_sin_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_sin_f32_e64 v3, -v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_amdgcn_sin_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_sin_f32_e64 v3, -v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%sin = call float @llvm.amdgcn.sin.f32(float %a)
%fneg = fneg float %sin
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; --------------------------------------------------------------------------------
; ftrunc tests
; --------------------------------------------------------------------------------
; fneg of llvm.trunc.f32 on a volatile-loaded value.
; Expected output for both check prefixes: v_trunc_f32_e64 reads its source
; with a neg modifier (-v3) and its result is stored directly; no separate
; negate instruction is emitted.
define amdgpu_kernel void @v_fneg_trunc_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_trunc_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_trunc_f32_e64 v2, -v3
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_trunc_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_trunc_f32_e64 v2, -v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%trunc = call float @llvm.trunc.f32(float %a)
%fneg = fneg float %trunc
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; --------------------------------------------------------------------------------
; fround tests
; --------------------------------------------------------------------------------
; fneg of llvm.round.f32 on a volatile-loaded value, with no fast-math flags
; on the IR instructions.
; Expected output for both check prefixes: round is expanded into v_trunc_f32,
; v_sub_f32, v_cmp_ge_f32 of |diff| against 0.5, v_cndmask_b32, v_bfi_b32 and
; v_add_f32, and the negation is NOT folded into that sequence: it remains an
; explicit v_xor_b32 with 0x80000000 applied to the final sum.
define amdgpu_kernel void @v_fneg_round_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_round_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_trunc_f32_e32 v2, v3
; SI-NEXT: v_sub_f32_e32 v4, v3, v2
; SI-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_bfi_b32 v3, s0, v4, v3
; SI-NEXT: v_add_f32_e32 v2, v2, v3
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_round_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_trunc_f32_e32 v2, v3
; VI-NEXT: v_sub_f32_e32 v4, v3, v2
; VI-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_bfi_b32 v3, s0, v4, v3
; VI-NEXT: v_add_f32_e32 v2, v2, v3
; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%round = call float @llvm.round.f32(float %a)
%fneg = fneg float %round
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; Same pattern as v_fneg_round_f32, except the fneg carries the nsz flag.
; The expected ISA no longer contains a v_xor_b32: the last step of the round
; expansion is emitted as v_sub_f32_e64 with a neg source modifier on the
; truncated value (-v2 - v3), so the negation is absorbed into the expansion.
define amdgpu_kernel void @v_fneg_round_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_round_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_trunc_f32_e32 v2, v3
; SI-NEXT: v_sub_f32_e32 v4, v3, v2
; SI-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_bfi_b32 v3, s0, v4, v3
; SI-NEXT: v_sub_f32_e64 v2, -v2, v3
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_round_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_trunc_f32_e32 v2, v3
; VI-NEXT: v_sub_f32_e32 v4, v3, v2
; VI-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_bfi_b32 v3, s0, v4, v3
; VI-NEXT: v_sub_f32_e64 v2, -v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; Index both buffers by the workitem id so each lane handles one element.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%round = call float @llvm.round.f32(float %a)
; The nsz flag on this fneg is the only IR difference from v_fneg_round_f32.
%fneg = fneg nsz float %round
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; --------------------------------------------------------------------------------
; rint tests
; --------------------------------------------------------------------------------
; fneg of llvm.rint.f32. The expected ISA is a single v_rndne_f32_e64 whose
; input carries a neg source modifier (-v3): the negation is moved onto the
; operand of the rounding instruction and no separate xor is emitted.
define amdgpu_kernel void @v_fneg_rint_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_rint_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_rndne_f32_e64 v2, -v3
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_rint_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_rndne_f32_e64 v2, -v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; Index both buffers by the workitem id so each lane handles one element.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%rint = call float @llvm.rint.f32(float %a)
; The rint result has a single use, the fneg below.
%fneg = fneg float %rint
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; --------------------------------------------------------------------------------
; nearbyint tests
; --------------------------------------------------------------------------------
; fneg of llvm.nearbyint.f32. The expected ISA matches the rint test: a single
; v_rndne_f32_e64 with a neg source modifier on its input (-v3) and no
; separate xor for the negation.
define amdgpu_kernel void @v_fneg_nearbyint_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_nearbyint_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_rndne_f32_e64 v2, -v3
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_nearbyint_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_rndne_f32_e64 v2, -v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; Index both buffers by the workitem id so each lane handles one element.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%nearbyint = call float @llvm.nearbyint.f32(float %a)
; The nearbyint result has a single use, the fneg below.
%fneg = fneg float %nearbyint
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; --------------------------------------------------------------------------------
; fcanonicalize tests
; --------------------------------------------------------------------------------
; fneg of llvm.canonicalize.f32. The expected ISA is a single
; v_mul_f32_e32 by the inline constant -1.0: the canonicalize and the
; negation are emitted as one multiply, with no separate xor.
define amdgpu_kernel void @v_fneg_canonicalize_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_fneg_canonicalize_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: flat_load_dword v3, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_canonicalize_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; Index both buffers by the workitem id so each lane handles one element.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
; Named after the intrinsic, like %round / %rint / %nearbyint in the sibling
; tests; SSA value names do not appear in the emitted ISA.
%canonicalize = call float @llvm.canonicalize.f32(float %a)
%fneg = fneg float %canonicalize
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; --------------------------------------------------------------------------------
; vintrp tests
; --------------------------------------------------------------------------------
; fneg of an fmul whose only users are two llvm.amdgcn.interp.p1 calls.
; The expected ISA shows the negation folded back into the multiply as a neg
; source modifier (v_mul_f32_e64 v2, v5, -v2); both v_interp_p1_f32
; instructions then read the already-negated product and no xor is emitted.
define amdgpu_kernel void @v_fneg_interp_p1_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_interp_p1_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_mov_b32 m0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v5, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e64 v2, v5, -v2
; SI-NEXT: v_interp_p1_f32 v3, v2, attr0.x
; SI-NEXT: v_interp_p1_f32 v2, v2, attr0.y
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_interp_p1_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_mov_b32 m0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e64 v2, v5, -v2
; VI-NEXT: v_interp_p1_f32_e32 v3, v2, attr0.x
; VI-NEXT: v_interp_p1_f32_e32 v2, v2, attr0.y
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Index all three buffers by the workitem id so each lane handles one element.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; %mul has no user other than the fneg.
%mul = fmul float %a, %b
%fneg = fneg float %mul
; The two calls differ only in the first i32 operand (0 vs 1), which shows up
; as attr0.x vs attr0.y in the expected ISA.
; NOTE(review): the last i32 0 presumably supplies the m0 value (the checks
; contain s_mov_b32 m0, 0) - confirm against the intrinsic definition.
%intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0)
%intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0)
; Both results are stored volatile to the same address.
store volatile float %intrp0, ptr addrspace(1) %out.gep
store volatile float %intrp1, ptr addrspace(1) %out.gep
ret void
}
; fneg of an fmul whose only users are two llvm.amdgcn.interp.p2 calls.
; As in the p1 test, the expected ISA folds the negation into the multiply
; (v_mul_f32_e64 v2, v5, -v2). The constant 4.0 first operand of each call is
; materialized into its own VGPR (v6 and v3), which v_interp_p2_f32 then
; overwrites with its result.
define amdgpu_kernel void @v_fneg_interp_p2_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_fneg_interp_p2_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: v_mov_b32_e32 v6, 4.0
; SI-NEXT: s_mov_b32 m0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: flat_load_dword v5, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
; SI-NEXT: v_mov_b32_e32 v3, 4.0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e64 v2, v5, -v2
; SI-NEXT: v_interp_p2_f32 v6, v2, attr0.x
; SI-NEXT: v_interp_p2_f32 v3, v2, attr0.y
; SI-NEXT: flat_store_dword v[0:1], v6
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_interp_p2_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: v_mov_b32_e32 v6, 4.0
; VI-NEXT: s_mov_b32 m0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_mov_b32_e32 v3, 4.0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e64 v2, v5, -v2
; VI-NEXT: v_interp_p2_f32_e32 v6, v2, attr0.x
; VI-NEXT: v_interp_p2_f32_e32 v3, v2, attr0.y
; VI-NEXT: flat_store_dword v[0:1], v6
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Index all three buffers by the workitem id so each lane handles one element.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; %mul has no user other than the fneg.
%mul = fmul float %a, %b
%fneg = fneg float %mul
; The negated product is the second operand here; the first is the constant 4.0.
; The calls differ only in the first i32 operand (0 vs 1), which shows up as
; attr0.x vs attr0.y in the expected ISA.
%intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0)
%intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0)
; Both results are stored volatile to the same address.
store volatile float %intrp0, ptr addrspace(1) %out.gep
store volatile float %intrp1, ptr addrspace(1) %out.gep
ret void
}
; --------------------------------------------------------------------------------
; CopyToReg tests
; --------------------------------------------------------------------------------
; fneg of an fmul where the negated value is only consumed in another basic
; block (%if), while the un-negated product is stored in %endif.
; The expected ISA keeps the entry-block multiply un-negated
; (v_mul_f32_e32 v3, v7, v3) and materializes the negation with a
; v_xor_b32 0x80000000 inside the %if block, just before the second multiply.
; NOTE(review): the name presumably refers to the cross-block value being
; carried through a CopyToReg node during selection - confirm.
define amdgpu_kernel void @v_fneg_copytoreg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, i32 %d) #0 {
; SI-LABEL: v_fneg_copytoreg_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; SI-NEXT: s_load_dword s0, s[4:5], 0x11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v6
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s13
; SI-NEXT: v_add_i32_e32 v2, vcc, s12, v6
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s15
; SI-NEXT: v_add_i32_e32 v4, vcc, s14, v6
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v7, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_add_i32_e32 v0, vcc, s8, v6
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: v_mul_f32_e32 v3, v7, v3
; SI-NEXT: s_cbranch_scc0 .LBB125_2
; SI-NEXT: ; %bb.1: ; %endif
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
; SI-NEXT: .LBB125_2: ; %if
; SI-NEXT: v_xor_b32_e32 v4, 0x80000000, v3
; SI-NEXT: v_mul_f32_e32 v2, v4, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_copytoreg_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_load_dword s0, s[4:5], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s13
; VI-NEXT: v_add_u32_e32 v2, vcc, s12, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s15
; VI-NEXT: v_add_u32_e32 v4, vcc, s14, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: v_mul_f32_e32 v3, v7, v3
; VI-NEXT: s_cbranch_scc0 .LBB125_2
; VI-NEXT: ; %bb.1: ; %endif
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB125_2: ; %if
; VI-NEXT: v_xor_b32_e32 v4, 0x80000000, v3
; VI-NEXT: v_mul_f32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Index all four buffers by the workitem id so each lane handles one element.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; %mul has two users: the fneg here and the store in %endif.
%mul = fmul float %a, %b
; %fneg is defined in the entry block but only used in %if.
%fneg = fneg float %mul
%cmp0 = icmp eq i32 %d, 0
br i1 %cmp0, label %if, label %endif
if:
; The only consumer of the negated product.
%mul1 = fmul float %fneg, %c
store volatile float %mul1, ptr addrspace(1) %out.gep
br label %endif
endif:
; Reached from both paths; stores the un-negated product.
store volatile float %mul, ptr addrspace(1) %out.gep
ret void
}
; --------------------------------------------------------------------------------
; inlineasm tests
; --------------------------------------------------------------------------------
; Can't fold into use, so should fold into source
; fneg of an fmul consumed by an inline asm "v" operand and by a volatile store.
; Every user wants the negated value, and the expected ISA folds the negation
; into the multiply as a neg source modifier (v_mul_f32_e64 v2, v6, -v2); the
; asm and the store both read v2 and no xor is emitted.
define amdgpu_kernel void @v_fneg_inlineasm_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, i32 %d) #0 {
; SI-LABEL: v_fneg_inlineasm_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: flat_load_dword v6, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e64 v2, v6, -v2
; SI-NEXT: ;;#ASMSTART
; SI-NEXT: ; use v2
; SI-NEXT: ;;#ASMEND
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_inlineasm_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e64 v2, v6, -v2
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use v2
; VI-NEXT: ;;#ASMEND
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Index all four buffers by the workitem id so each lane handles one element.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; %c is never used, but the load is volatile and still appears in the expected
; ISA. The %d argument is unused as well.
%c = load volatile float, ptr addrspace(1) %c.gep
; %mul has no user other than the fneg.
%mul = fmul float %a, %b
%fneg = fneg float %mul
; The asm takes the negated value in a VGPR ("v" constraint).
call void asm sideeffect "; use $0", "v"(float %fneg)
store volatile float %fneg, ptr addrspace(1) %out.gep
ret void
}
; --------------------------------------------------------------------------------
; inlineasm tests (fneg source has multiple uses)
; --------------------------------------------------------------------------------
; Can't fold into use, and the source multiply has another user, so the fneg stays a separate xor
; Variant of v_fneg_inlineasm_f32 in which the volatile store writes the
; un-negated %mul instead of %fneg, so the multiply has a second user.
; The expected ISA keeps the multiply un-negated (v_mul_f32_e32 v2, v6, v2)
; and produces the asm operand with an explicit v_xor_b32 0x80000000 into v3;
; the store writes v2.
define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, i32 %d) #0 {
; SI-LABEL: v_fneg_inlineasm_multi_use_src_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: flat_load_dword v6, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v2, v6, v2
; SI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; SI-NEXT: ;;#ASMSTART
; SI-NEXT: ; use v3
; SI-NEXT: ;;#ASMEND
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_inlineasm_multi_use_src_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v2, v6, v2
; VI-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use v3
; VI-NEXT: ;;#ASMEND
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Index all four buffers by the workitem id so each lane handles one element.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
; %c is never used, but the load is volatile and still appears in the expected
; ISA. The %d argument is unused as well.
%c = load volatile float, ptr addrspace(1) %c.gep
; %mul has two users: the fneg and the volatile store at the end.
%mul = fmul float %a, %b
%fneg = fneg float %mul
; The asm takes the negated value in a VGPR ("v" constraint).
call void asm sideeffect "; use $0", "v"(float %fneg)
; Stores %mul, not %fneg - this is what gives the multiply its second use.
store volatile float %mul, ptr addrspace(1) %out.gep
ret void
}
; --------------------------------------------------------------------------------
; code size regression tests
; --------------------------------------------------------------------------------
; There are multiple users of the fneg that must use a VOP3
; instruction, so there is no penalty
; One fneg of a loaded value feeding two llvm.fma.f32 calls. The expected ISA
; folds the negation into both users as a neg source modifier on the first
; operand (v_fma_f32 ..., -v6, ...) and emits no separate xor.
define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: multiuse_fneg_2_vop3_users_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v6, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_fma_f32 v2, -v6, v2, v3
; SI-NEXT: v_fma_f32 v3, -v6, v3, 2.0
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: multiuse_fneg_2_vop3_users_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_fma_f32 v2, -v6, v2, v3
; VI-NEXT: v_fma_f32 v3, -v6, v3, 2.0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Index the three input buffers by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
; NOTE(review): %out.gep is computed but never used; both stores below write
; to %out directly, which the expected ISA reflects (address taken from s[0:1]
; with no per-lane offset).
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; The negation is applied to a loaded value, so it cannot be folded into a
; producing instruction; it has exactly two users, both fma.
%fneg.a = fneg float %a
%fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
%fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0)
store volatile float %fma0, ptr addrspace(1) %out
store volatile float %fma1, ptr addrspace(1) %out
ret void
}
; There are multiple users, but both require using a larger encoding
; for the modifier.
; One fneg of a loaded value feeding two fmul instructions. The expected ISA
; folds the negation into both multiplies, each emitted in the _e64 form with a
; neg source modifier on the first operand (v_mul_f32_e64 ..., -v6, ...), and
; emits no separate xor.
define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: multiuse_fneg_2_vop2_users_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v6, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mul_f32_e64 v2, -v6, v2
; SI-NEXT: v_mul_f32_e64 v3, -v6, v3
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: multiuse_fneg_2_vop2_users_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mul_f32_e64 v2, -v6, v2
; VI-NEXT: v_mul_f32_e64 v3, -v6, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Index the three input buffers by the workitem id.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
; NOTE(review): %out.gep is computed but never used; both stores below write
; to %out directly, which the expected ISA reflects (address taken from s[0:1]
; with no per-lane offset).
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; The negation is applied to a loaded value, so it cannot be folded into a
; producing instruction; it has exactly two users, both fmul.
%fneg.a = fneg float %a
%mul0 = fmul float %fneg.a, %b
%mul1 = fmul float %fneg.a, %c
store volatile float %mul0, ptr addrspace(1) %out
store volatile float %mul1, ptr addrspace(1) %out
ret void
}
; One user is VOP3 so has no cost to folding the modifier, the other does.
define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
; SI-LABEL: multiuse_fneg_vop2_vop3_users_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v6, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_fma_f32 v2, -v6, v2, 2.0
; SI-NEXT: v_mul_f32_e64 v3, -v6, v3
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: multiuse_fneg_vop2_vop3_users_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_fma_f32 v2, -v6, v2, 2.0
; VI-NEXT: v_mul_f32_e64 v3, -v6, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Test body: one fneg of %a shared by an fma user and an fmul user.
; Address each input buffer by the workitem id, sign-extended to i64.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
; %out.gep has no users in this function; both stores below write through %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads of the three operands.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
; The negated value feeds both the fma and the fmul. The assertions above
; expect it as a -v6 source modifier on v_fma_f32 and on v_mul_f32_e64, with
; no separate negate instruction.
%fneg.a = fneg float %a
%fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0)
%mul1 = fmul float %fneg.a, %c
; Both results are stored (volatile) to the same address.
store volatile float %fma0, ptr addrspace(1) %out
store volatile float %mul1, ptr addrspace(1) %out
ret void
}
; The use of the fneg requires a code size increase, but folding into
; the source does not
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, ptr addrspace(1) %d.ptr) #0 {
; SI-LABEL: free_fold_src_code_size_cost_use_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v6
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s13
; SI-NEXT: v_add_i32_e32 v2, vcc, s12, v6
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s15
; SI-NEXT: v_add_i32_e32 v4, vcc, s14, v6
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: v_mov_b32_e32 v7, s1
; SI-NEXT: v_add_i32_e32 v6, vcc, s0, v6
; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; SI-NEXT: flat_load_dword v8, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v4, v[6:7] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_fma_f32 v2, v8, v2, 2.0
; SI-NEXT: v_mul_f32_e64 v3, -v2, v3
; SI-NEXT: v_mul_f32_e64 v2, -v2, v4
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: free_fold_src_code_size_cost_use_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s13
; VI-NEXT: v_add_u32_e32 v2, vcc, s12, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s15
; VI-NEXT: v_add_u32_e32 v4, vcc, s14, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_add_u32_e32 v6, vcc, s0, v6
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; VI-NEXT: flat_load_dword v8, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[6:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_fma_f32 v2, v8, v2, 2.0
; VI-NEXT: v_mul_f32_e64 v3, -v2, v3
; VI-NEXT: v_mul_f32_e64 v2, -v2, v4
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Test body: fneg of an fma result (no fast-math flags) with two fmul users.
; Address each input buffer by the workitem id, sign-extended to i64.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%d.gep = getelementptr inbounds float, ptr addrspace(1) %d.ptr, i64 %tid.ext
; %out.gep has no users in this function; both stores below write through %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads of the four operands.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
%d = load volatile float, ptr addrspace(1) %d.gep
; The fma call carries no fast-math flags. The assertions above expect the fma
; to stay un-negated (v8, v2, 2.0) and the negation to show up as a -v2 source
; modifier on each of the two v_mul_f32_e64 users.
%fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0)
%fneg.fma0 = fneg float %fma0
%mul1 = fmul float %fneg.fma0, %c
%mul2 = fmul float %fneg.fma0, %d
; Both products are stored (volatile) to the same address.
store volatile float %mul1, ptr addrspace(1) %out
store volatile float %mul2, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32_nsz(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, ptr addrspace(1) %d.ptr) #0 {
; SI-LABEL: free_fold_src_code_size_cost_use_f32_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v6
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s13
; SI-NEXT: v_add_i32_e32 v2, vcc, s12, v6
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s15
; SI-NEXT: v_add_i32_e32 v4, vcc, s14, v6
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: v_mov_b32_e32 v7, s1
; SI-NEXT: v_add_i32_e32 v6, vcc, s0, v6
; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; SI-NEXT: flat_load_dword v8, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v4, v[6:7] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_fma_f32 v2, v8, -v2, -2.0
; SI-NEXT: v_mul_f32_e32 v3, v2, v3
; SI-NEXT: v_mul_f32_e32 v2, v2, v4
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: free_fold_src_code_size_cost_use_f32_nsz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s13
; VI-NEXT: v_add_u32_e32 v2, vcc, s12, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s15
; VI-NEXT: v_add_u32_e32 v4, vcc, s14, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_add_u32_e32 v6, vcc, s0, v6
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; VI-NEXT: flat_load_dword v8, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[6:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_fma_f32 v2, v8, -v2, -2.0
; VI-NEXT: v_mul_f32_e32 v3, v2, v3
; VI-NEXT: v_mul_f32_e32 v2, v2, v4
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Test body: fneg of an nsz fma result with two fmul users.
; Address each input buffer by the workitem id, sign-extended to i64.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%d.gep = getelementptr inbounds float, ptr addrspace(1) %d.ptr, i64 %tid.ext
; %out.gep has no users in this function; both stores below write through %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads of the four operands.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
%d = load volatile float, ptr addrspace(1) %d.gep
; The fma call is marked nsz. The assertions above expect the negation to be
; absorbed into the fma itself (operands v8, -v2, -2.0), leaving both
; multiplies as plain v_mul_f32_e32 with no source modifier.
%fma0 = call nsz float @llvm.fma.f32(float %a, float %b, float 2.0)
%fneg.fma0 = fneg float %fma0
%mul1 = fmul float %fneg.fma0, %c
%mul2 = fmul float %fneg.fma0, %d
; Both products are stored (volatile) to the same address.
store volatile float %mul1, ptr addrspace(1) %out
store volatile float %mul2, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, ptr addrspace(1) %d.ptr) #0 {
; SI-LABEL: free_fold_src_code_size_cost_use_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
; SI-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v6
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s13
; SI-NEXT: v_add_i32_e32 v2, vcc, s12, v6
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s15
; SI-NEXT: v_add_i32_e32 v4, vcc, s14, v6
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v7, s1
; SI-NEXT: v_add_i32_e32 v6, vcc, s0, v6
; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; SI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dwordx2 v[6:7], v[6:7] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 2.0
; SI-NEXT: v_mov_b32_e32 v2, s8
; SI-NEXT: v_mov_b32_e32 v3, s9
; SI-NEXT: v_mul_f64 v[4:5], -v[0:1], v[4:5]
; SI-NEXT: v_mul_f64 v[0:1], -v[0:1], v[6:7]
; SI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: free_fold_src_code_size_cost_use_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44
; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s13
; VI-NEXT: v_add_u32_e32 v2, vcc, s12, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s15
; VI-NEXT: v_add_u32_e32 v4, vcc, s14, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_add_u32_e32 v6, vcc, s0, v6
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx2 v[6:7], v[6:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 2.0
; VI-NEXT: v_mul_f64 v[2:3], -v[0:1], v[4:5]
; VI-NEXT: v_mul_f64 v[0:1], -v[0:1], v[6:7]
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Test body: double-precision negated fma result with two fmul users.
; Address each input buffer by the workitem id, sign-extended to i64.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds double, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds double, ptr addrspace(1) %c.ptr, i64 %tid.ext
%d.gep = getelementptr inbounds double, ptr addrspace(1) %d.ptr, i64 %tid.ext
; %out.gep has no users in this function; both stores below write through %out.
%out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads of the four operands.
%a = load volatile double, ptr addrspace(1) %a.gep
%b = load volatile double, ptr addrspace(1) %b.gep
%c = load volatile double, ptr addrspace(1) %c.gep
%d = load volatile double, ptr addrspace(1) %d.gep
%fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0)
; The negation here is spelled as a subtraction from -0.0 rather than with the
; fneg instruction. The assertions above expect an un-negated v_fma_f64 and a
; -v[0:1] source modifier on each of the two v_mul_f64 users.
%fneg.fma0 = fsub double -0.0, %fma0
%mul1 = fmul double %fneg.fma0, %c
%mul2 = fmul double %fneg.fma0, %d
; Both products are stored (volatile) to the same address.
store volatile double %mul1, ptr addrspace(1) %out
store volatile double %mul2, ptr addrspace(1) %out
ret void
}
; %trunc.a has one fneg use, but it requires a code size increase and
; the fneg can instead be folded for free into the fma.
define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, ptr addrspace(1) %d.ptr) #0 {
; SI-LABEL: one_use_cost_to_fold_into_src_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v6
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s13
; SI-NEXT: v_add_i32_e32 v2, vcc, s12, v6
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s15
; SI-NEXT: v_add_i32_e32 v4, vcc, s14, v6
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: flat_load_dword v8, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v7, s1
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v6
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
; SI-NEXT: flat_load_dword v0, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_trunc_f32_e32 v0, v8
; SI-NEXT: v_fma_f32 v2, -v0, v2, v3
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: one_use_cost_to_fold_into_src_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s13
; VI-NEXT: v_add_u32_e32 v2, vcc, s12, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s15
; VI-NEXT: v_add_u32_e32 v4, vcc, s14, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v8, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_trunc_f32_e32 v0, v8
; VI-NEXT: v_fma_f32 v2, -v0, v2, v3
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Test body: fneg of a trunc result whose only user is an fma.
; Address each input buffer by the workitem id, sign-extended to i64.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%d.gep = getelementptr inbounds float, ptr addrspace(1) %d.ptr, i64 %tid.ext
; %out.gep has no users in this function; the store below writes through %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads. %d is loaded but its value is not used by any instruction
; below; the load still appears in the expected output because it is volatile.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
%d = load volatile float, ptr addrspace(1) %d.gep
; The assertions above expect a plain v_trunc_f32_e32 followed by a v_fma_f32
; that takes the truncated value with a negate source modifier (-v0).
%trunc.a = call float @llvm.trunc.f32(float %a)
%trunc.fneg.a = fneg float %trunc.a
%fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
store volatile float %fma0, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @multi_use_cost_to_fold_into_src(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, ptr addrspace(1) %d.ptr) #0 {
; SI-LABEL: multi_use_cost_to_fold_into_src:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v6
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s13
; SI-NEXT: v_add_i32_e32 v2, vcc, s12, v6
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, s15
; SI-NEXT: v_add_i32_e32 v4, vcc, s14, v6
; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; SI-NEXT: v_mov_b32_e32 v7, s1
; SI-NEXT: v_add_i32_e32 v6, vcc, s0, v6
; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; SI-NEXT: flat_load_dword v8, v[0:1] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v2, v[2:3] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v3, v[4:5] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_load_dword v4, v[6:7] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_trunc_f32_e32 v5, v8
; SI-NEXT: v_fma_f32 v2, -v5, v2, v3
; SI-NEXT: v_mul_f32_e32 v3, v5, v4
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: flat_store_dword v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: multi_use_cost_to_fold_into_src:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s13
; VI-NEXT: v_add_u32_e32 v2, vcc, s12, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s15
; VI-NEXT: v_add_u32_e32 v4, vcc, s14, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_add_u32_e32 v6, vcc, s0, v6
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; VI-NEXT: flat_load_dword v8, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[6:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_trunc_f32_e32 v5, v8
; VI-NEXT: v_fma_f32 v2, -v5, v2, v3
; VI-NEXT: v_mul_f32_e32 v3, v5, v4
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; Test body: a trunc result used both negated (by an fma) and un-negated (by an
; fmul).
; Address each input buffer by the workitem id, sign-extended to i64.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
%d.gep = getelementptr inbounds float, ptr addrspace(1) %d.ptr, i64 %tid.ext
; %out.gep has no users in this function; both stores below write through %out.
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads of the four operands.
%a = load volatile float, ptr addrspace(1) %a.gep
%b = load volatile float, ptr addrspace(1) %b.gep
%c = load volatile float, ptr addrspace(1) %c.gep
%d = load volatile float, ptr addrspace(1) %d.gep
; %trunc.a has two users: the fneg (consumed by the fma) and the fmul, which
; reads it directly. The assertions above expect one v_trunc_f32_e32 into v5,
; then v_fma_f32 with -v5 and v_mul_f32_e32 with plain v5.
%trunc.a = call float @llvm.trunc.f32(float %a)
%trunc.fneg.a = fneg float %trunc.a
%fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
%mul1 = fmul float %trunc.a, %d
; Both results are stored (volatile) to the same address.
store volatile float %fma0, ptr addrspace(1) %out
store volatile float %mul1, ptr addrspace(1) %out
ret void
}
; The AMDGPU combine to pull fneg into the FMA operands was being
; undone by the generic combine to pull the fneg out of the fma if
; !isFNegFree. We were reporting false for v2f32 even though it will
; be split into f32 where it will be free.
define <2 x float> @fneg_fma_fneg_dagcombine_loop(<2 x float> %arg, <2 x float> %arg1, <2 x float> %arg2) #0 {
; GCN-LABEL: fneg_fma_fneg_dagcombine_loop:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_brev_b32 s4, 1
; GCN-NEXT: v_fma_f32 v3, v3, -v5, s4
; GCN-NEXT: v_fma_f32 v2, v2, -v4, s4
; GCN-NEXT: v_sub_f32_e32 v1, v3, v1
; GCN-NEXT: v_sub_f32_e32 v0, v2, v0
; GCN-NEXT: v_mul_f32_e32 v0, v0, v4
; GCN-NEXT: v_mul_f32_e32 v1, v1, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
; Test body: computes -(fma(%arg1, %arg2, 0) + %arg) * %arg2 on <2 x float>,
; with the fast flag on every operation except the fneg itself.
bb:
%i3 = call fast <2 x float> @llvm.fma.v2f32(<2 x float> %arg1, <2 x float> %arg2, <2 x float> zeroinitializer)
%i4 = fadd fast <2 x float> %i3, %arg
; The assertions above expect this negation to be carried by source modifiers
; on two scalar v_fma_f32 instructions (followed by v_sub_f32 / v_mul_f32),
; with no standalone negate instruction in the output.
%i5 = fneg <2 x float> %i4
%i6 = fmul fast <2 x float> %i5, %arg2
ret <2 x float> %i6
}
; This expects denormal flushing, so can't turn this fmul into fneg
define float @nnan_fmul_neg1_to_fneg(float %x, float %y) #0 {
; GCN-LABEL: nnan_fmul_neg1_to_fneg:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
; Test body: (%x * -1.0) * %y. Only the second fmul carries nnan; the multiply
; by -1.0 has no fast-math flags. Attribute group #0 is defined outside this
; part of the file.
%mul = fmul float %x, -1.0
; Despite its name, %add is an fmul. The assertions above expect the whole
; expression to become a single v_mul_f32_e64 with a negated first source.
%add = fmul nnan float %mul, %y
ret float %add
}
; It's legal to turn this fmul into an fneg since denormals are
; preserved and we know an snan can't happen from the flag.
define float @denormal_fmul_neg1_to_fneg(float %x, float %y) {
; GCN-LABEL: denormal_fmul_neg1_to_fneg:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
; Test body: (%x * -1.0) * %y with nnan on the multiply by -1.0. This function
; has no attribute group attached.
%mul = fmul nnan float %x, -1.0
; Despite its name, %add is an fmul. The assertions above expect the whole
; expression to become a single v_mul_f32_e64 with a negated first source.
%add = fmul float %mul, %y
ret float %add
}
; We know the source can't be an snan because it is the result of an fmul.
define float @denorm_snan_fmul_neg1_to_fneg(float %x, float %y) {
; GCN-LABEL: denorm_snan_fmul_neg1_to_fneg:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e64 v0, v0, -v0
; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
; Test body: ((%x * %x) * -1.0) * %y with no fast-math flags and no attribute
; group on the function.
; The input to the multiply by -1.0 is itself produced by an fmul.
%canonical = fmul float %x, %x
; The assertions above expect no separate instruction for this multiply: the
; negation shows up as a -v0 source modifier on the %x * %x v_mul_f32_e64.
%mul = fmul float %canonical, -1.0
; Despite its name, %add is an fmul.
%add = fmul float %mul, %y
ret float %add
}
define float @flush_snan_fmul_neg1_to_fneg(float %x, float %y) #0 {
; GCN-LABEL: flush_snan_fmul_neg1_to_fneg:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
; Test body: (canonicalize(%x) * -1.0) * %y with no fast-math flags. Attribute
; group #0 is defined outside this part of the file.
; The assertions above expect the canonicalize to be emitted as a multiply by
; 1.0 (v_mul_f32_e32 v0, 1.0, v0).
%quiet = call float @llvm.canonicalize.f32(float %x)
; The multiply by -1.0 is expected to disappear into a -v0 source modifier on
; the final v_mul_f32_e64.
%mul = fmul float %quiet, -1.0
; Despite its name, %add is an fmul.
%add = fmul float %mul, %y
ret float %add
}
define float @fadd_select_fneg_fneg_f32(i32 %arg0, float %x, float %y, float %z) {
; GCN-LABEL: fadd_select_fneg_fneg_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GCN-NEXT: v_sub_f32_e32 v0, v3, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
; Test body: select between -%x and -%y on (%arg0 == 0), then add %z.
%cmp = icmp eq i32 %arg0, 0
; Both select operands are negated.
%neg.x = fneg float %x
%neg.y = fneg float %y
%select = select i1 %cmp, float %neg.x, float %neg.y
; The assertions above expect the select to pick between the un-negated inputs
; (v_cndmask_b32 on v2/v1) and the add to become v_sub_f32 of %z minus that
; result, so neither fneg is emitted as its own instruction.
%add = fadd float %select, %z
ret float %add
}
define double @fadd_select_fneg_fneg_f64(i32 %arg0, double %x, double %y, double %z) {
; GCN-LABEL: fadd_select_fneg_fneg_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GCN-NEXT: v_add_f64 v[0:1], v[5:6], -v[1:2]
; GCN-NEXT: s_setpc_b64 s[30:31]
; Test body: select between -%x and -%y on (%arg0 == 0), then add %z, in
; double precision.
%cmp = icmp eq i32 %arg0, 0
; Both select operands are negated.
%neg.x = fneg double %x
%neg.y = fneg double %y
%select = select i1 %cmp, double %neg.x, double %neg.y
; The assertions above expect two v_cndmask_b32 (one per 32-bit half) on the
; un-negated inputs, and a v_add_f64 that applies the negation as a -v[1:2]
; source modifier.
%add = fadd double %select, %z
ret double %add
}
define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) {
; SI-LABEL: fadd_select_fneg_fneg_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v3
; SI-NEXT: v_sub_f32_e32 v0, v1, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: fadd_select_fneg_fneg_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_sub_f16_e32 v0, v3, v0
; VI-NEXT: s_setpc_b64 s[30:31]
; Test body: select between -%x and -%y on (%arg0 == 0), then add %z, in half
; precision.
%cmp = icmp eq i32 %arg0, 0
; Both select operands are negated.
%neg.x = fneg half %x
%neg.y = fneg half %y
%select = select i1 %cmp, half %neg.x, half %neg.y
; In both sets of assertions above the select operates on the un-negated
; inputs and the add becomes a subtract. The SI sequence converts to f32,
; uses v_sub_f32 and converts back; the VI sequence uses v_sub_f16 directly.
%add = fadd half %select, %z
ret half %add
}
; FIXME: Terrible code for SI
define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
; SI-LABEL: fadd_select_fneg_fneg_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v2, v4
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_sub_f32_e32 v1, v2, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_sub_f32_e32 v0, v3, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: fadd_select_fneg_fneg_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_sub_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_sub_f16_e32 v0, v3, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
; Test body: select between -%x and -%y on a scalar (%arg0 == 0) condition,
; then add %z, on <2 x half>.
%cmp = icmp eq i32 %arg0, 0
; Both select operands are negated.
%neg.x = fneg <2 x half> %x
%neg.y = fneg <2 x half> %y
; The i1 condition is scalar, so it picks one whole vector or the other.
%select = select i1 %cmp, <2 x half> %neg.x, <2 x half> %neg.y
; In both sets of assertions above a single v_cndmask_b32 selects the packed
; un-negated value and each element's add becomes a subtract. The SI sequence
; unpacks the halves with shifts, works in f32 and repacks with shift + or;
; the VI sequence uses v_sub_f16_sdwa for the high half and v_sub_f16 for the
; low half.
%add = fadd <2 x half> %select, %z
ret <2 x half> %add
}
; FIXME: This fneg should fold into select
define float @v_fneg_select_f32(i32 %arg0, float %a, float %b, float %c) {
; GCN-LABEL: v_fneg_select_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
; Test body: negate the result of a select between two function arguments.
; The %c parameter is not referenced anywhere in the body.
%cond = icmp eq i32 %arg0, 0
%select = select i1 %cond, float %a, float %b
; The assertions above record the current output: the negation is a separate
; v_xor_b32 with 0x80000000 applied after the v_cndmask_b32.
%fneg = fneg float %select
ret float %fneg
}
; FIXME: This fneg should fold into select
define float @v_fneg_select_2_f32(i32 %arg0, float %a, float %b, float %c) {
; GCN-LABEL: v_fneg_select_2_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1
; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
; Test body: negate the result of a select whose operands are both computed by
; an fadd with a constant. The %c parameter is not referenced anywhere in the
; body.
%cond = icmp eq i32 %arg0, 0
%add.0 = fadd float %a, 2.0
%add.1 = fadd float %b, 4.0
%select = select i1 %cond, float %add.0, float %add.1
; The assertions above record the current output: both adds are emitted
; un-negated and the negation is a separate v_xor_b32 with 0x80000000 after
; the v_cndmask_b32.
%neg.select = fneg float %select
ret float %neg.select
}
define amdgpu_kernel void @v_fneg_posk_select_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) {
; SI-LABEL: v_fneg_posk_select_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v3
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dword v4, v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s1
; SI-NEXT: v_add_i32_e32 v1, vcc, s0, v3
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 4.0, v4, vcc
; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; SI-NEXT: flat_store_dword v[1:2], v0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_posk_select_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v3
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v4, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v3
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 4.0, v4, vcc
; VI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; VI-NEXT: flat_store_dword v[1:2], v0
; VI-NEXT: s_endpgm
; Test body: negate a select between the positive constant 4.0 and a loaded
; value.
; Address the input and output buffers by the workitem id, sign-extended to i64.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
; Workitem 0 selects the constant; every other workitem selects the loaded %a.
%cond = icmp eq i32 %tid, 0
%select = select i1 %cond, float 4.0, float %a
; The assertions above record the current output: v_cndmask_b32 keeps the 4.0
; constant as written and the negation is a separate v_xor_b32 with 0x80000000.
%fneg = fneg float %select
; Unlike the load, this store is not volatile and goes to the per-workitem
; address %out.gep.
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
define amdgpu_kernel void @v_fneg_negk_select_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) {
; SI-LABEL: v_fneg_negk_select_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_add_i32_e32 v1, vcc, s2, v3
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: flat_load_dword v4, v[1:2] glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s1
; SI-NEXT: v_add_i32_e32 v1, vcc, s0, v3
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -4.0, v4, vcc
; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; SI-NEXT: flat_store_dword v[1:2], v0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_fneg_negk_select_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v3
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v4, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v3
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, -4.0, v4, vcc
; VI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; VI-NEXT: flat_store_dword v[1:2], v0
; VI-NEXT: s_endpgm
; Test body: negate a select between the negative constant -4.0 and a loaded
; value.
; Address the input and output buffers by the workitem id, sign-extended to i64.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile float, ptr addrspace(1) %a.gep
; Workitem 0 selects the constant; every other workitem selects the loaded %a.
%cond = icmp eq i32 %tid, 0
%select = select i1 %cond, float -4.0, float %a
; The assertions above record the current output: v_cndmask_b32 keeps the -4.0
; constant as written and the negation is a separate v_xor_b32 with 0x80000000.
%fneg = fneg float %select
; Unlike the load, this store is not volatile and goes to the per-workitem
; address %out.gep.
store float %fneg, ptr addrspace(1) %out.gep
ret void
}
; Declarations of the generic LLVM intrinsics and the AMDGPU-specific
; llvm.amdgcn.* intrinsics for this test file. A trailing #N on a declaration
; names one of the attribute groups defined after the declarations.
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fma.f32(float, float, float) #1
; The v2f32 fma declaration carries no attribute group, unlike its neighbours.
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare float @llvm.fmuladd.f32(float, float, float) #1
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
declare float @llvm.sin.f32(float) #1
declare float @llvm.trunc.f32(float) #1
declare float @llvm.round.f32(float) #1
declare float @llvm.rint.f32(float) #1
declare float @llvm.nearbyint.f32(float) #1
declare float @llvm.canonicalize.f32(float) #1
declare float @llvm.minnum.f32(float, float) #1
declare float @llvm.maxnum.f32(float, float) #1
declare half @llvm.minnum.f16(half, half) #1
declare double @llvm.minnum.f64(double, double) #1
declare double @llvm.fma.f64(double, double, double) #1
declare float @llvm.amdgcn.sin.f32(float) #1
declare float @llvm.amdgcn.rcp.f32(float) #1
declare float @llvm.amdgcn.rcp.legacy(float) #1
declare float @llvm.amdgcn.fmul.legacy(float, float) #1
; The two interp declarations use group #0 rather than #1.
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
; Attribute groups referenced as #0, #1 and #2.
; #0 - nounwind plus a denormal_fpenv setting of preservesign for float; used
;      by the two llvm.amdgcn.interp declarations.
; #1 - nounwind readnone; used by most of the intrinsic declarations.
; #2 - nounwind only; not used by any intrinsic declaration, presumably
;      attached to function definitions elsewhere in the file (TODO confirm).
attributes #0 = { nounwind denormal_fpenv(float: preservesign) }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }