; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,FAST
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefixes=CHECK,DEFAULT
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -fp-contract=fast | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 | %ptxas-verify %}

target triple = "nvptx64-unknown-cuda"

;; Make sure we are generating proper instruction sequences for fused ops
;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit
;; add.f32 otherwise. Without an explicit rounding mode on add.f32, ptxas
;; is free to fuse with a multiply if it is able. If fusion is not allowed,
;; we do not form fma.rn at the PTX level and explicitly generate add.rn
;; for all adds to prevent ptxas from fusing the ops.

define float @t0(float %a, float %b, float %c) {
; FAST-LABEL: t0(
; FAST:       {
; FAST-NEXT:    .reg .b32 %r<5>;
; FAST-EMPTY:
; FAST-NEXT:  // %bb.0:
; FAST-NEXT:    ld.param.b32 %r1, [t0_param_0];
; FAST-NEXT:    ld.param.b32 %r2, [t0_param_1];
; FAST-NEXT:    ld.param.b32 %r3, [t0_param_2];
; FAST-NEXT:    fma.rn.f32 %r4, %r1, %r2, %r3;
; FAST-NEXT:    st.param.b32 [func_retval0], %r4;
; FAST-NEXT:    ret;
;
; DEFAULT-LABEL: t0(
; DEFAULT:       {
; DEFAULT-NEXT:    .reg .b32 %r<6>;
; DEFAULT-EMPTY:
; DEFAULT-NEXT:  // %bb.0:
; DEFAULT-NEXT:    ld.param.b32 %r1, [t0_param_0];
; DEFAULT-NEXT:    ld.param.b32 %r2, [t0_param_1];
; DEFAULT-NEXT:    mul.rn.f32 %r3, %r1, %r2;
; DEFAULT-NEXT:    ld.param.b32 %r4, [t0_param_2];
; DEFAULT-NEXT:    add.rn.f32 %r5, %r3, %r4;
; DEFAULT-NEXT:    st.param.b32 [func_retval0], %r5;
; DEFAULT-NEXT:    ret;
  %v0 = fmul float %a, %b
  %v1 = fadd float %v0, %c
  ret float %v1
}

;; We cannot form an fma here, but make sure we explicitly emit add.rn.f32
;; to prevent ptxas from fusing this with anything else.
define float @t1(float %a, float %b) {
; FAST-LABEL: t1(
; FAST:       {
; FAST-NEXT:    .reg .b32 %r<6>;
; FAST-EMPTY:
; FAST-NEXT:  // %bb.0:
; FAST-NEXT:    ld.param.b32 %r1, [t1_param_0];
; FAST-NEXT:    ld.param.b32 %r2, [t1_param_1];
; FAST-NEXT:    add.f32 %r3, %r1, %r2;
; FAST-NEXT:    sub.f32 %r4, %r1, %r2;
; FAST-NEXT:    mul.f32 %r5, %r3, %r4;
; FAST-NEXT:    st.param.b32 [func_retval0], %r5;
; FAST-NEXT:    ret;
;
; DEFAULT-LABEL: t1(
; DEFAULT:       {
; DEFAULT-NEXT:    .reg .b32 %r<6>;
; DEFAULT-EMPTY:
; DEFAULT-NEXT:  // %bb.0:
; DEFAULT-NEXT:    ld.param.b32 %r1, [t1_param_0];
; DEFAULT-NEXT:    ld.param.b32 %r2, [t1_param_1];
; DEFAULT-NEXT:    add.rn.f32 %r3, %r1, %r2;
; DEFAULT-NEXT:    sub.rn.f32 %r4, %r1, %r2;
; DEFAULT-NEXT:    mul.rn.f32 %r5, %r3, %r4;
; DEFAULT-NEXT:    st.param.b32 [func_retval0], %r5;
; DEFAULT-NEXT:    ret;
  %v1 = fadd float %a, %b
  %v2 = fsub float %a, %b
  %v3 = fmul float %v1, %v2
  ret float %v3
}

;; Make sure we generate the non ".rn" version when the "contract" flag is
;; present on the instructions

define float @t2(float %a, float %b) {
; CHECK-LABEL: t2(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [t2_param_0];
; CHECK-NEXT:    ld.param.b32 %r2, [t2_param_1];
; CHECK-NEXT:    add.f32 %r3, %r1, %r2;
; CHECK-NEXT:    sub.f32 %r4, %r1, %r2;
; CHECK-NEXT:    mul.f32 %r5, %r3, %r4;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
; CHECK-NEXT:    ret;
  %v1 = fadd contract float %a, %b
  %v2 = fsub contract float %a, %b
  %v3 = fmul contract float %v1, %v2
  ret float %v3
}

;; Make sure we always fold to fma when the "contract" flag is present

define float @t3(float %a, float %b, float %c) {
; CHECK-LABEL: t3(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [t3_param_0];
; CHECK-NEXT:    ld.param.b32 %r2, [t3_param_1];
; CHECK-NEXT:    ld.param.b32 %r3, [t3_param_2];
; CHECK-NEXT:    fma.rn.f32 %r4, %r1, %r2, %r3;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
; CHECK-NEXT:    ret;
  %v0 = fmul contract float %a, %b
  %v1 = fadd contract float %v0, %c
  ret float %v1
}