From 49b5a1fa71d5af2b463c327e17017c754eb50801 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 3 Aug 2025 09:13:34 +0100 Subject: [PATCH] [AMDGPU] fmuladd.f32.ll - clean up prefixes and regenerate checks (#151832) Automate the fmuladd.f32.ll test checks as manually fixing changes while working on the topological dag patches was doing my head in --- llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll | 2841 ++++++++++++++++++++--- 1 file changed, 2518 insertions(+), 323 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll index ceacdf5e254a..cbda0621a318 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll @@ -1,45 +1,184 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,SI-DENORM,GCN-DENORM-FASTFMA,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,SI-DENORM,GCN-DENORM-SLOWFMA,SI %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=SI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=SI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=SI-DENORM-FASTFMA,SI-DENORM-FASTFMA-STRICT %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=SI-DENORM-SLOWFMA %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI-DENORM-FASTFMA,SI-DENORM-FASTFMA-CONTRACT %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI-DENORM-SLOWFMA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-FLUSH,GFX9-FLUSH-MAD %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DENORM,GFX9-DENORM-FASTFMA-MAD %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM-FASTFMA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-FLUSH,GFX9-FLUSH-FMAC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DENORM,GFX9-DENORM-FASTFMA-FMAC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-FMAC %s - -; FIXME: Should probably test this, but sometimes selecting fmac is painful to match. -; XUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM-FASTFMA %s - -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-FMAC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=ieee -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=ieee -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10-DENORM %s ; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow. target triple = "amdgcn--" - declare i32 @llvm.amdgcn.workitem.id.x() #1 declare float @llvm.fmuladd.f32(float, float, float) #1 declare half @llvm.fmuladd.f16(half, half, half) #1 declare float @llvm.fabs.f32(float) #1 -; GCN-LABEL: {{^}}fmuladd_f32: -; GCN-FLUSH-MAD: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -; GCN-FLUSH-FMAC: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} - -; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}} - -; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -define amdgpu_kernel void @fmuladd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, - ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +define amdgpu_kernel void @fmuladd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +; SI-FLUSH-LABEL: fmuladd_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s10, -1 +; SI-FLUSH-NEXT: s_mov_b32 s14, s10 +; SI-FLUSH-NEXT: s_mov_b32 s15, s11 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b32 s12, s2 +; SI-FLUSH-NEXT: s_mov_b32 s13, s3 +; SI-FLUSH-NEXT: s_mov_b32 s16, s4 +; SI-FLUSH-NEXT: s_mov_b32 s17, s5 +; SI-FLUSH-NEXT: s_mov_b32 s18, s10 +; SI-FLUSH-NEXT: s_mov_b32 s19, s11 +; SI-FLUSH-NEXT: s_mov_b32 s4, s6 +; SI-FLUSH-NEXT: s_mov_b32 s5, s7 +; SI-FLUSH-NEXT: s_mov_b32 s6, s10 +; SI-FLUSH-NEXT: s_mov_b32 s7, s11 +; SI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-FLUSH-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-FLUSH-NEXT: s_mov_b32 s8, s0 +; SI-FLUSH-NEXT: s_mov_b32 s9, s1 +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; SI-FLUSH-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-LABEL: fmuladd_f32: +; SI-DENORM-FASTFMA: ; %bb.0: +; SI-DENORM-FASTFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s10, -1 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s14, s10 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s15, s11 +; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s12, s2 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s13, s3 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s16, s4 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s17, s5 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s18, s10 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s19, s11 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s4, s6 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s5, s7 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s6, s10 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s7, s11 +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s8, s0 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s9, s1 +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v2 +; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-DENORM-FASTFMA-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmuladd_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s10, -1 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s14, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s15, s11 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s12, s2 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s13, s3 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s16, s4 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s17, s5 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s18, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s19, s11 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s4, s6 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s5, s7 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, s11 +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s8, s0 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s9, s1 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(1) +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; GFX9-FLUSH-MAD-LABEL: fmuladd_f32: +; GFX9-FLUSH-MAD: ; %bb.0: +; GFX9-FLUSH-MAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-FLUSH-MAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[10:11] +; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[12:13] +; GFX9-FLUSH-MAD-NEXT: global_load_dword v3, v0, s[14:15] +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v3, v1, v2 +; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v3, s[8:9] +; GFX9-FLUSH-MAD-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_f32: +; GFX9-DENORM-FASTFMA-MAD: ; %bb.0: +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[10:11] +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[12:13] +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v3, v0, s[14:15] +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, v2, v3 +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm +; +; GFX9-FLUSH-FMAC-LABEL: fmuladd_f32: +; GFX9-FLUSH-FMAC: ; %bb.0: +; GFX9-FLUSH-FMAC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-FLUSH-FMAC-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v1, v0, s[10:11] +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v2, v0, s[12:13] +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v3, v0, s[14:15] +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX9-FLUSH-FMAC-NEXT: global_store_dword v0, v3, s[8:9] +; GFX9-FLUSH-FMAC-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-FMAC-LABEL: fmuladd_f32: +; GFX9-DENORM-FASTFMA-FMAC: ; %bb.0: +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v1, v0, s[10:11] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v2, v0, s[12:13] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v3, v0, s[14:15] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_store_dword v0, v3, s[8:9] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_endpgm +; +; GFX10-LABEL: fmuladd_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v3, v0, s[6:7] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX10-NEXT: global_store_dword v0, v3, s[0:1] +; GFX10-NEXT: s_endpgm %r0 = load float, ptr addrspace(1) %in1 %r1 = load float, ptr addrspace(1) %in2 %r2 = load float, ptr addrspace(1) %in3 @@ -48,18 +187,190 @@ define amdgpu_kernel void @fmuladd_f32(ptr addrspace(1) %out, ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}fmul_fadd_f32: -; GCN-FLUSH: v_mac_f32 - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 - -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 - -; GCN-DENORM-STRICT: v_mul_f32_e32 -; GCN-DENORM-STRICT: v_add_f32_e32 -define amdgpu_kernel void @fmul_fadd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, - ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +define amdgpu_kernel void @fmul_fadd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +; SI-FLUSH-LABEL: fmul_fadd_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s10, -1 +; SI-FLUSH-NEXT: s_mov_b32 s14, s10 +; SI-FLUSH-NEXT: s_mov_b32 s15, s11 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b32 s12, s2 +; SI-FLUSH-NEXT: s_mov_b32 s13, s3 +; SI-FLUSH-NEXT: s_mov_b32 s16, s4 +; SI-FLUSH-NEXT: s_mov_b32 s17, s5 +; SI-FLUSH-NEXT: s_mov_b32 s18, s10 +; SI-FLUSH-NEXT: s_mov_b32 s19, s11 +; SI-FLUSH-NEXT: s_mov_b32 s4, s6 +; SI-FLUSH-NEXT: s_mov_b32 s5, s7 +; SI-FLUSH-NEXT: s_mov_b32 s6, s10 +; SI-FLUSH-NEXT: s_mov_b32 s7, s11 +; SI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: s_mov_b32 s8, s0 +; SI-FLUSH-NEXT: s_mov_b32 s9, s1 +; SI-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; SI-FLUSH-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: fmul_fadd_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s11, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s10, -1 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s14, s10 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s15, s11 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s12, s2 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s13, s3 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s16, s4 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s17, s5 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s18, s10 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s19, s11 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s4, s6 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s5, s7 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, s10 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, s11 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s8, s0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s9, s1 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmul_fadd_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s10, -1 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s14, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s15, s11 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s12, s2 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s13, s3 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s16, s4 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s17, s5 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s18, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s19, s11 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s4, s6 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s5, s7 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, s11 +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s8, s0 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s9, s1 +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: fmul_fadd_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s11, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s10, -1 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s14, s10 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s15, s11 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s12, s2 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s13, s3 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s16, s4 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s17, s5 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s18, s10 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s19, s11 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s4, s6 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s5, s7 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, s10 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, s11 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s8, s0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s9, s1 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: fmul_fadd_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v3, v1, v2 +; GFX9-FLUSH-NEXT: global_store_dword v0, v3, s[8:9] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: fmul_fadd_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fmul_fadd_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[4:5] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[6:7] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v3, v1, v2 +; GFX10-FLUSH-NEXT: global_store_dword v0, v3, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fmul_fadd_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[4:5] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[6:7] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %r0 = load volatile float, ptr addrspace(1) %in1 %r1 = load volatile float, ptr addrspace(1) %in2 %r2 = load volatile float, ptr addrspace(1) %in3 @@ -69,15 +380,172 @@ define amdgpu_kernel void @fmul_fadd_f32(ptr addrspace(1) %out, ptr addrspace(1) ret void } -; GCN-LABEL: {{^}}fmul_fadd_contract_f32: -; GCN-FLUSH-FMAC: v_fmac_f32_e32 - -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 - -; GCN-DENORM-FASTFMA: v_fma_f32 -define amdgpu_kernel void @fmul_fadd_contract_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, - ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +define amdgpu_kernel void @fmul_fadd_contract_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +; SI-FLUSH-LABEL: fmul_fadd_contract_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s10, -1 +; SI-FLUSH-NEXT: s_mov_b32 s14, s10 +; SI-FLUSH-NEXT: s_mov_b32 s15, s11 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b32 s12, s2 +; SI-FLUSH-NEXT: s_mov_b32 s13, s3 +; SI-FLUSH-NEXT: s_mov_b32 s16, s4 +; SI-FLUSH-NEXT: s_mov_b32 s17, s5 +; SI-FLUSH-NEXT: s_mov_b32 s18, s10 +; SI-FLUSH-NEXT: s_mov_b32 s19, s11 +; SI-FLUSH-NEXT: s_mov_b32 s4, s6 +; SI-FLUSH-NEXT: s_mov_b32 s5, s7 +; SI-FLUSH-NEXT: s_mov_b32 s6, s10 +; SI-FLUSH-NEXT: s_mov_b32 s7, s11 +; SI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: s_mov_b32 s8, s0 +; SI-FLUSH-NEXT: s_mov_b32 s9, s1 +; SI-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; SI-FLUSH-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-LABEL: fmul_fadd_contract_f32: +; SI-DENORM-FASTFMA: ; %bb.0: +; SI-DENORM-FASTFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s10, -1 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s14, s10 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s15, s11 +; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s12, s2 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s13, s3 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s16, s4 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s17, s5 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s18, s10 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s19, s11 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s4, s6 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s5, s7 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s6, s10 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s7, s11 +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s8, s0 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s9, s1 +; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v2 +; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-DENORM-FASTFMA-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmul_fadd_contract_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s10, -1 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s14, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s15, s11 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s12, s2 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s13, s3 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s16, s4 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s17, s5 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s18, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s19, s11 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s4, s6 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s5, s7 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, s11 +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s8, s0 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s9, s1 +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; GFX9-FLUSH-MAD-LABEL: fmul_fadd_contract_f32: +; GFX9-FLUSH-MAD: ; %bb.0: +; GFX9-FLUSH-MAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-FLUSH-MAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v3, v1, v2 +; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v3, s[8:9] +; GFX9-FLUSH-MAD-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-MAD-LABEL: fmul_fadd_contract_f32: +; GFX9-DENORM-FASTFMA-MAD: ; %bb.0: +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, v2, v3 +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm +; +; GFX9-FLUSH-FMAC-LABEL: fmul_fadd_contract_f32: +; GFX9-FLUSH-FMAC: ; %bb.0: +; GFX9-FLUSH-FMAC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-FLUSH-FMAC-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX9-FLUSH-FMAC-NEXT: global_store_dword v0, v3, s[8:9] +; GFX9-FLUSH-FMAC-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-FMAC-LABEL: fmul_fadd_contract_f32: +; GFX9-DENORM-FASTFMA-FMAC: ; %bb.0: +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_store_dword v0, v3, s[8:9] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_endpgm +; +; GFX10-LABEL: fmul_fadd_contract_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX10-NEXT: global_store_dword v0, v3, s[0:1] +; GFX10-NEXT: s_endpgm %r0 = load volatile float, ptr addrspace(1) %in1 %r1 = load volatile float, ptr addrspace(1) %in2 %r2 = load volatile float, ptr addrspace(1) %in3 @@ -87,23 +555,120 @@ define amdgpu_kernel void @fmul_fadd_contract_f32(ptr addrspace(1) %out, ptr add ret void } -; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32 -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], - -; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]] -; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]] -; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] - -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_2.0_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-FLUSH-LABEL: fmuladd_2.0_a_b_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 v3, 2.0, v2 +; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-LABEL: fmuladd_2.0_a_b_f32: +; SI-DENORM-FASTFMA: ; %bb.0: +; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, 2.0, v3 +; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmuladd_2.0_a_b_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; GFX9-FLUSH-MAD-LABEL: fmuladd_2.0_a_b_f32: +; GFX9-FLUSH-MAD: ; %bb.0: +; GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v2, 2.0, v1 +; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-MAD-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_2.0_a_b_f32: +; GFX9-DENORM-FASTFMA-MAD: ; %bb.0: +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, 2.0, v2 +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm +; +; GFX9-FLUSH-FMAC-LABEL: fmuladd_2.0_a_b_f32: +; GFX9-FLUSH-FMAC: ; %bb.0: +; GFX9-FLUSH-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX9-FLUSH-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-FMAC-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-FMAC-LABEL: fmuladd_2.0_a_b_f32: +; GFX9-DENORM-FASTFMA-FMAC: ; %bb.0: +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_endpgm +; +; GFX10-LABEL: fmuladd_2.0_a_b_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX10-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -117,24 +682,120 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32 -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], - -; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]] -; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]] - -; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] - -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_a_2.0_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-FLUSH-LABEL: fmuladd_a_2.0_b_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 v3, 2.0, v2 +; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-LABEL: fmuladd_a_2.0_b_f32: +; SI-DENORM-FASTFMA: ; %bb.0: +; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, 2.0, v3 +; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmuladd_a_2.0_b_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; GFX9-FLUSH-MAD-LABEL: fmuladd_a_2.0_b_f32: +; GFX9-FLUSH-MAD: ; %bb.0: +; GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v2, 2.0, v1 +; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-MAD-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_a_2.0_b_f32: +; GFX9-DENORM-FASTFMA-MAD: ; %bb.0: +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, 2.0, v2 +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm +; +; GFX9-FLUSH-FMAC-LABEL: fmuladd_a_2.0_b_f32: +; GFX9-FLUSH-FMAC: ; %bb.0: +; GFX9-FLUSH-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX9-FLUSH-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-FMAC-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-FMAC-LABEL: fmuladd_a_2.0_b_f32: +; GFX9-DENORM-FASTFMA-FMAC: ; %bb.0: +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_endpgm +; +; GFX10-LABEL: fmuladd_a_2.0_b_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX10-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -148,28 +809,126 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}fadd_a_a_b_f32: -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], - -; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]] - -; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] - -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] - -; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @fadd_a_a_b_f32(ptr addrspace(1) %out, - ptr addrspace(1) %in1, - ptr addrspace(1) %in2) #0 { +define amdgpu_kernel void @fadd_a_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; SI-FLUSH-LABEL: fadd_a_a_b_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 v3, 2.0, v2 +; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: fadd_a_a_b_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fadd_a_a_b_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: fadd_a_a_b_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, 2.0, v3 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: fadd_a_a_b_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v2, 2.0, v1 +; GFX9-FLUSH-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: fadd_a_a_b_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fadd_a_a_b_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v2, 2.0, v1 +; GFX10-FLUSH-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fadd_a_a_b_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -184,28 +943,126 @@ define amdgpu_kernel void @fadd_a_a_b_f32(ptr addrspace(1) %out, ret void } -; GCN-LABEL: {{^}}fadd_b_a_a_f32: -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], - -; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]] - -; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] - -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] - -; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @fadd_b_a_a_f32(ptr addrspace(1) %out, - ptr addrspace(1) %in1, - ptr addrspace(1) %in2) #0 { +define amdgpu_kernel void @fadd_b_a_a_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; SI-FLUSH-LABEL: fadd_b_a_a_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 v3, 2.0, v2 +; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: fadd_b_a_a_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v3, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fadd_b_a_a_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v3, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: fadd_b_a_a_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, 2.0, v3 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: fadd_b_a_a_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v2, 2.0, v1 +; GFX9-FLUSH-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: fadd_b_a_a_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fadd_b_a_a_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v2, 2.0, v1 +; GFX10-FLUSH-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fadd_b_a_a_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -220,20 +1077,120 @@ define amdgpu_kernel void @fadd_b_a_a_f32(ptr addrspace(1) %out, ret void } -; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32 -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], -; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]] -; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]] - -; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] - -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 v3, -2.0, v2 +; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-LABEL: fmuladd_neg_2.0_a_b_f32: +; SI-DENORM-FASTFMA: ; %bb.0: +; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, -2.0, v3 +; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmuladd_neg_2.0_a_b_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v3, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; GFX9-FLUSH-MAD-LABEL: fmuladd_neg_2.0_a_b_f32: +; GFX9-FLUSH-MAD: ; %bb.0: +; GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v2, -2.0, v1 +; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-MAD-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_neg_2.0_a_b_f32: +; GFX9-DENORM-FASTFMA-MAD: ; %bb.0: +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, -2.0, v2 +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm +; +; GFX9-FLUSH-FMAC-LABEL: fmuladd_neg_2.0_a_b_f32: +; GFX9-FLUSH-FMAC: ; %bb.0: +; GFX9-FLUSH-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: v_fmac_f32_e32 v2, -2.0, v1 +; GFX9-FLUSH-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-FMAC-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-FMAC-LABEL: fmuladd_neg_2.0_a_b_f32: +; GFX9-DENORM-FASTFMA-FMAC: ; %bb.0: +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_fmac_f32_e32 v2, -2.0, v1 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_endpgm +; +; GFX10-LABEL: fmuladd_neg_2.0_a_b_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_fmac_f32_e32 v2, -2.0, v1 +; GFX10-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -247,25 +1204,120 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(ptr addrspace(1) %out, ptr ad ret void } -; XXX -; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32 -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], - -; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]] -; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]] - -; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] - -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 v3, 2.0, v2 +; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-LABEL: fmuladd_neg_2.0_neg_a_b_f32: +; SI-DENORM-FASTFMA: ; %bb.0: +; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, 2.0, v3 +; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmuladd_neg_2.0_neg_a_b_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v3, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; GFX9-FLUSH-MAD-LABEL: fmuladd_neg_2.0_neg_a_b_f32: +; GFX9-FLUSH-MAD: ; %bb.0: +; GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v2, 2.0, v1 +; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-MAD-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_neg_2.0_neg_a_b_f32: +; GFX9-DENORM-FASTFMA-MAD: ; %bb.0: +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, 2.0, v2 +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm +; +; GFX9-FLUSH-FMAC-LABEL: fmuladd_neg_2.0_neg_a_b_f32: +; GFX9-FLUSH-FMAC: ; %bb.0: +; GFX9-FLUSH-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX9-FLUSH-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-FMAC-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-FMAC-LABEL: fmuladd_neg_2.0_neg_a_b_f32: +; GFX9-DENORM-FASTFMA-FMAC: ; %bb.0: +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_endpgm +; +; GFX10-LABEL: fmuladd_neg_2.0_neg_a_b_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX10-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -281,24 +1333,120 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, pt ret void } -; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32: -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], - -; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]] -; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]] - -; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] - -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 v3, -2.0, v2 +; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-LABEL: fmuladd_2.0_neg_a_b_f32: +; SI-DENORM-FASTFMA: ; %bb.0: +; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, -2.0, v3 +; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmuladd_2.0_neg_a_b_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v3, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; GFX9-FLUSH-MAD-LABEL: fmuladd_2.0_neg_a_b_f32: +; GFX9-FLUSH-MAD: ; %bb.0: +; GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v2, -2.0, v1 +; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-MAD-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_2.0_neg_a_b_f32: +; GFX9-DENORM-FASTFMA-MAD: ; %bb.0: +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, -2.0, v2 +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm +; +; GFX9-FLUSH-FMAC-LABEL: fmuladd_2.0_neg_a_b_f32: +; GFX9-FLUSH-FMAC: ; %bb.0: +; GFX9-FLUSH-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: v_fmac_f32_e32 v2, -2.0, v1 +; GFX9-FLUSH-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-FMAC-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-FMAC-LABEL: fmuladd_2.0_neg_a_b_f32: +; GFX9-DENORM-FASTFMA-FMAC: ; %bb.0: +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_fmac_f32_e32 v2, -2.0, v1 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_endpgm +; +; GFX10-LABEL: fmuladd_2.0_neg_a_b_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_fmac_f32_e32 v2, -2.0, v1 +; GFX10-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -314,23 +1462,107 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad ret void } -; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32: -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], -; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] -; GCN-FLUSH-FMAC: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] - -; SI-FLUSH: buffer_store_dword [[RESULT]] -; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] - -; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] - -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mad_f32 v2, v2, 2.0, -v3 +; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-LABEL: fmuladd_2.0_a_neg_b_f32: +; SI-DENORM-FASTFMA: ; %bb.0: +; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, 2.0, -v3 +; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmuladd_2.0_a_neg_b_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; GFX9-FLUSH-MAD-LABEL: fmuladd_2.0_a_neg_b_f32: +; GFX9-FLUSH-MAD: ; %bb.0: +; GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: v_mad_f32 v1, v1, 2.0, -v2 +; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-FLUSH-MAD-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: fmuladd_2.0_a_neg_b_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_fma_f32 v1, v1, 2.0, -v2 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX9-FLUSH-FMAC-LABEL: fmuladd_2.0_a_neg_b_f32: +; GFX9-FLUSH-FMAC: ; %bb.0: +; GFX9-FLUSH-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: v_fma_f32 v1, v1, 2.0, -v2 +; GFX9-FLUSH-FMAC-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-FLUSH-FMAC-NEXT: s_endpgm +; +; GFX10-LABEL: fmuladd_2.0_a_neg_b_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_fma_f32 v1, v1, 2.0, -v2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -346,23 +1578,150 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(ptr addrspace(1) %out, ptr ad ret void } -; GCN-LABEL: {{^}}mad_sub_f32: -; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] -; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] - -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] - -; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] - -; SI: buffer_store_dword [[RESULT]] -; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_sub_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 { +; SI-FLUSH-LABEL: mad_sub_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s6, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-FLUSH-NEXT: v_mad_f32 v2, v2, v3, -v4 +; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: mad_sub_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e32 v2, v2, v4 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: mad_sub_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v2, v4 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: mad_sub_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, v3, -v4 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: mad_sub_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mad_f32 v1, v1, v2, -v3 +; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: mad_sub_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-DENORM-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: mad_sub_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mad_f32 v1, v1, v2, -v3 +; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: mad_sub_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-DENORM-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext @@ -380,24 +1739,150 @@ define amdgpu_kernel void @mad_sub_f32(ptr addrspace(1) noalias nocapture %out, ret void } -; GCN-LABEL: {{^}}mad_sub_inv_f32: -; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] - -; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] - -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] - -; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] - -; SI: buffer_store_dword [[RESULT]] -; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_sub_inv_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 { +; SI-FLUSH-LABEL: mad_sub_inv_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s6, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-FLUSH-NEXT: v_mad_f32 v2, -v2, v3, v4 +; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: mad_sub_inv_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e32 v2, v4, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: mad_sub_inv_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v4, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: mad_sub_inv_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, -v2, v3, v4 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: mad_sub_inv_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mad_f32 v1, -v1, v2, v3 +; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: mad_sub_inv_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-DENORM-NEXT: v_sub_f32_e32 v1, v3, v1 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: mad_sub_inv_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mad_f32 v1, -v1, v2, v3 +; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: mad_sub_inv_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-DENORM-NEXT: v_sub_f32_e32 v1, v3, v1 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext @@ -415,23 +1900,150 @@ define amdgpu_kernel void @mad_sub_inv_f32(ptr addrspace(1) noalias nocapture %o ret void } -; GCN-LABEL: {{^}}mad_sub_fabs_f32: -; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] -; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| - -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| - -; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| - -; SI: buffer_store_dword [[RESULT]] -; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_sub_fabs_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 { +; SI-FLUSH-LABEL: mad_sub_fabs_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s6, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-FLUSH-NEXT: v_mad_f32 v2, v2, v3, -|v4| +; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: mad_sub_fabs_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e64 v2, v2, |v4| +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: mad_sub_fabs_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e64 v2, v2, |v4| +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: mad_sub_fabs_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, v3, -|v4| +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: mad_sub_fabs_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mad_f32 v1, v1, v2, -|v3| +; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: mad_sub_fabs_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-DENORM-NEXT: v_sub_f32_e64 v1, v1, |v3| +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: mad_sub_fabs_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mad_f32 v1, v1, v2, -|v3| +; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: mad_sub_fabs_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-DENORM-NEXT: v_sub_f32_e64 v1, v1, |v3| +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext @@ -450,24 +2062,150 @@ define amdgpu_kernel void @mad_sub_fabs_f32(ptr addrspace(1) noalias nocapture % ret void } -; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32: -; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] -; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| -; GCN-FLUSH-FMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| - -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] - -; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] - -; SI: buffer_store_dword [[RESULT]] -; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_sub_fabs_inv_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 { +; SI-FLUSH-LABEL: mad_sub_fabs_inv_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s6, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-FLUSH-NEXT: v_mad_f32 v2, -v2, v3, |v4| +; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: mad_sub_fabs_inv_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e64 v2, |v4|, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: mad_sub_fabs_inv_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e64 v2, |v4|, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: mad_sub_fabs_inv_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, -v2, v3, |v4| +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: mad_sub_fabs_inv_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mad_f32 v1, -v1, v2, |v3| +; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: mad_sub_fabs_inv_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-DENORM-NEXT: v_sub_f32_e64 v1, |v3|, v1 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mad_f32 v1, -v1, v2, |v3| +; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: mad_sub_fabs_inv_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-DENORM-NEXT: v_sub_f32_e64 v1, |v3|, v1 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext @@ -486,26 +2224,150 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f32(ptr addrspace(1) noalias nocaptu ret void } -; GCN-LABEL: {{^}}neg_neg_mad_f32: -; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] - -; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGA]], [[REGB]] -; SI-FLUSH: buffer_store_dword [[REGC]] -; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]] - -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] - -; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @neg_neg_mad_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 { +; SI-FLUSH-LABEL: neg_neg_mad_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s6, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-FLUSH-NEXT: v_mac_f32_e32 v4, v2, v3 +; SI-FLUSH-NEXT: buffer_store_dword v4, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: neg_neg_mad_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v4, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: neg_neg_mad_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v4, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: neg_neg_mad_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, v3, v4 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: neg_neg_mad_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v3, v1, v2 +; GFX9-FLUSH-NEXT: global_store_dword v0, v3, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: neg_neg_mad_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: neg_neg_mad_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v3, v1, v2 +; GFX10-FLUSH-NEXT: global_store_dword v0, v3, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: neg_neg_mad_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext @@ -525,23 +2387,150 @@ define amdgpu_kernel void @neg_neg_mad_f32(ptr addrspace(1) noalias nocapture %o ret void } -; GCN-LABEL: {{^}}mad_fabs_sub_f32: -; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] -; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] - -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| -; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] - -; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| -; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] - -; SI: buffer_store_dword [[RESULT]] -; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_fabs_sub_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 { +; SI-FLUSH-LABEL: mad_fabs_sub_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s6, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-FLUSH-NEXT: v_mad_f32 v2, v2, |v3|, -v4 +; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: mad_fabs_sub_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e64 v2, v2, |v3| +; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e32 v2, v2, v4 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: mad_fabs_sub_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e64 v2, v2, |v3| +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v2, v4 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: mad_fabs_sub_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, |v3|, -v4 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: mad_fabs_sub_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mad_f32 v1, v1, |v2|, -v3 +; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: mad_fabs_sub_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f32_e64 v1, v1, |v2| +; GFX9-DENORM-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: mad_fabs_sub_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mad_f32 v1, v1, |v2|, -v3 +; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: mad_fabs_sub_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_mul_f32_e64 v1, v1, |v2| +; GFX10-DENORM-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext @@ -560,24 +2549,126 @@ define amdgpu_kernel void @mad_fabs_sub_f32(ptr addrspace(1) noalias nocapture % ret void } -; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32: -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], -; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]] -; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] - -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] - -; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fsub_c_fadd_a_a_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-FLUSH-LABEL: fsub_c_fadd_a_a_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 v3, -2.0, v2 +; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: fsub_c_fadd_a_a_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e32 v2, v3, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fsub_c_fadd_a_a_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v3, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: fsub_c_fadd_a_a_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, -2.0, v3 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: fsub_c_fadd_a_a_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v2, -2.0, v1 +; GFX9-FLUSH-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: fsub_c_fadd_a_a_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-DENORM-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fsub_c_fadd_a_a_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v2, -2.0, v1 +; GFX10-FLUSH-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fsub_c_fadd_a_a_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX10-DENORM-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -593,22 +2684,126 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32: -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], -; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] - -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] - -; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] - -; SI: buffer_store_dword [[RESULT]] -; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fsub_fadd_a_a_c_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-FLUSH-LABEL: fsub_fadd_a_a_c_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mad_f32 v2, v2, 2.0, -v3 +; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: fsub_fadd_a_a_c_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e32 v2, v2, v3 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fsub_fadd_a_a_c_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: fsub_fadd_a_a_c_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, 2.0, -v3 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: fsub_fadd_a_a_c_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mad_f32 v1, v1, 2.0, -v2 +; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: fsub_fadd_a_a_c_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-DENORM-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fsub_fadd_a_a_c_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mad_f32 v1, v1, 2.0, -v2 +; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fsub_fadd_a_a_c_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX10-DENORM-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1